library(ArchR)
## 
##                                                    / |
##                                                  /    \
##             .                                  /      |.
##             \\\                              /        |.
##               \\\                          /           `|.
##                 \\\                      /              |.
##                   \                    /                |\
##                   \\#####\           /                  ||
##                 ==###########>      /                   ||
##                  \\##==......\    /                     ||
##             ______ =       =|__ /__                     ||      \\\
##         ,--' ,----`-,__ ___/'  --,-`-===================##========>
##        \               '        ##_______ _____ ,--,__,=##,__   ///
##         ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
##         -,____,---'       \\####\\________________,--\\_##,/
##            ___      .______        ______  __    __  .______      
##           /   \     |   _  \      /      ||  |  |  | |   _  \     
##          /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
##         /  /_\  \   |      /     |  |     |   __   | |      /     
##        /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
##       /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
## 
## ArchR : Version 1.0.2
## For more information see our website : www.ArchRProject.com
## If you encounter a bug please report : https://github.com/GreenleafLab/ArchR/issues
## Loading Required Packages...
##  Loading Package : grid v4.2.1
##  Loading Package : gridExtra v2.3
##  Loading Package : gtools v3.9.4
##  Loading Package : gtable v0.3.1
##  Loading Package : ggplot2 v3.4.0.9000
##  Loading Package : magrittr v2.0.3
##  Loading Package : plyr v1.8.8
##  Loading Package : stringr v1.5.0
##  Loading Package : data.table v1.14.6
##  Loading Package : matrixStats v0.63.0
##  Loading Package : S4Vectors v0.36.1
## Warning: package 'S4Vectors' was built under R version 4.2.2
##  Loading Package : GenomicRanges v1.48.0
##  Loading Package : BiocGenerics v0.44.0
##  Loading Package : Matrix v1.5.3
##  Loading Package : Rcpp v1.0.9
##  Loading Package : SummarizedExperiment v1.26.1
## Warning: multiple methods tables found for 'aperm'
## Warning: replacing previous import 'BiocGenerics::aperm' by
## 'DelayedArray::aperm' when loading 'SummarizedExperiment'
##  Loading Package : rhdf5 v2.40.0
library(BSgenome.Hsapiens.UCSC.hg19)
## Loading required package: BSgenome
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
## 
##     compact
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:grid':
## 
##     pattern
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: rtracklayer

Create Arrow files

addArchRThreads(threads = 8) 
## Setting default number of Parallel threads to 8.
inputFiles <- getTutorialData("Hematopoiesis")
## Downloading files to HemeFragments...
inputFiles
##                                       scATAC_BMMC_R1 
##      "HemeFragments/scATAC_BMMC_R1.fragments.tsv.gz" 
##                                  scATAC_CD34_BMMC_R1 
## "HemeFragments/scATAC_CD34_BMMC_R1.fragments.tsv.gz" 
##                                       scATAC_PBMC_R1 
##      "HemeFragments/scATAC_PBMC_R1.fragments.tsv.gz"
addArchRGenome("hg19")
## Setting default genome to Hg19.
ArrowFiles <- createArrowFiles(
  inputFiles = inputFiles,
  sampleNames = names(inputFiles),
  filterTSS = 4, #Dont set this too high because you can always increase later
  filterFrags = 1000, 
  addTileMat = TRUE,
  addGeneScoreMat = TRUE
)
## filterFrags is no longer a valid input. Please use minFrags! Setting filterFrags value to minFrags!
## filterTSS is no longer a valid input. Please use minTSS! Setting filterTSS value to minTSS!
## Using GeneAnnotation set by addArchRGenome(Hg19)!
## Using GeneAnnotation set by addArchRGenome(Hg19)!
## ArchR logging to : ArchRLogs/ArchR-createArrows-246065dc654-Date-2023-01-15_Time-20-50-37.log
## If there is an issue, please report to github with logFile!
## Cleaning Temporary Files
## 2023-01-15 20:50:38 : Batch Execution w/ safelapply!, 0 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-createArrows-246065dc654-Date-2023-01-15_Time-20-50-37.log

Doublets detection

doubScores <- addDoubletScores(
    input = ArrowFiles,
    k = 10, #Refers to how many cells near a "pseudo-doublet" to count.
    knnMethod = "UMAP", #Refers to the embedding to use for nearest neighbor search with doublet projection.
    LSIMethod = 1
)
## ArchR logging to : ArchRLogs/ArchR-addDoubletScores-246048451d94-Date-2023-01-15_Time-20-50-38.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:50:38 : Batch Execution w/ safelapply!, 0 mins elapsed.
## 2023-01-15 20:50:38 : scATAC_BMMC_R1 (1 of 3) :  Computing Doublet Statistics, 0 mins elapsed.
## scATAC_BMMC_R1 (1 of 3) : UMAP Projection R^2 = 0.98404
## scATAC_BMMC_R1 (1 of 3) : UMAP Projection R^2 = 0.98404
## 2023-01-15 20:52:08 : scATAC_CD34_BMMC_R1 (2 of 3) :  Computing Doublet Statistics, 1.498 mins elapsed.
## scATAC_CD34_BMMC_R1 (2 of 3) : UMAP Projection R^2 = 0.9927
## scATAC_CD34_BMMC_R1 (2 of 3) : UMAP Projection R^2 = 0.9927
## 2023-01-15 20:53:11 : scATAC_PBMC_R1 (3 of 3) :  Computing Doublet Statistics, 2.56 mins elapsed.
## scATAC_PBMC_R1 (3 of 3) : UMAP Projection R^2 = 0.98207
## scATAC_PBMC_R1 (3 of 3) : UMAP Projection R^2 = 0.98207
## ArchR logging successful to : ArchRLogs/ArchR-addDoubletScores-246048451d94-Date-2023-01-15_Time-20-50-38.log

Create ArchRProject

projHeme1 <- ArchRProject(
  ArrowFiles = ArrowFiles, 
  outputDirectory = "HemeOutput",
  copyArrows = TRUE
)
## Using GeneAnnotation set by addArchRGenome(Hg19)!
## Using GeneAnnotation set by addArchRGenome(Hg19)!
## Validating Arrows...
## Getting SampleNames...
## 
## Copying ArrowFiles to Ouptut Directory! If you want to save disk space set copyArrows = FALSE
## 1 2 3 
## Getting Cell Metadata...
## 
## Merging Cell Metadata...
## Initializing ArchRProject...
## 
##                                                    / |
##                                                  /    \
##             .                                  /      |.
##             \\\                              /        |.
##               \\\                          /           `|.
##                 \\\                      /              |.
##                   \                    /                |\
##                   \\#####\           /                  ||
##                 ==###########>      /                   ||
##                  \\##==......\    /                     ||
##             ______ =       =|__ /__                     ||      \\\
##         ,--' ,----`-,__ ___/'  --,-`-===================##========>
##        \               '        ##_______ _____ ,--,__,=##,__   ///
##         ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
##         -,____,---'       \\####\\________________,--\\_##,/
##            ___      .______        ______  __    __  .______      
##           /   \     |   _  \      /      ||  |  |  | |   _  \     
##          /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
##         /  /_\  \   |      /     |  |     |   __   | |      /     
##        /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
##       /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|
## 
getAvailableMatrices(projHeme1)
## [1] "GeneScoreMatrix" "TileMatrix"
head(projHeme1$cellNames)
## [1] "scATAC_BMMC_R1#TTATGTCAGTGATTAG-1" "scATAC_BMMC_R1#AAGATAGTCACCGCGA-1"
## [3] "scATAC_BMMC_R1#GCATTGAAGATTCCGT-1" "scATAC_BMMC_R1#TATGTTCAGGGTTCCC-1"
## [5] "scATAC_BMMC_R1#TCCATCGGTCCCGTGA-1" "scATAC_BMMC_R1#AGTTACGAGAACGTCG-1"
table(projHeme1$Sample)
## 
##      scATAC_BMMC_R1 scATAC_CD34_BMMC_R1      scATAC_PBMC_R1 
##                4932                3275                2453
quantile(projHeme1$TSSEnrichment)
##       0%      25%      50%      75%     100% 
##  4.10900 13.92550 16.81500 19.93025 41.98000
QC_df <- getCellColData(projHeme1, select = c("log10(nFrags)", "TSSEnrichment"))
QC_df
## DataFrame with 10660 rows and 2 columns
##                                   log10(nFrags) TSSEnrichment
##                                       <numeric>     <numeric>
## scATAC_BMMC_R1#TTATGTCAGTGATTAG-1       4.41812         7.204
## scATAC_BMMC_R1#AAGATAGTCACCGCGA-1       4.31488         7.949
## scATAC_BMMC_R1#GCATTGAAGATTCCGT-1       4.27852         4.447
## scATAC_BMMC_R1#TATGTTCAGGGTTCCC-1       4.26236         6.941
## scATAC_BMMC_R1#TCCATCGGTCCCGTGA-1       4.24199         4.771
## ...                                         ...           ...
## scATAC_PBMC_R1#GCTGCGAAGATCCGAG-1       3.01620        24.257
## scATAC_PBMC_R1#GCAGCTGGTGGCCTTG-1       3.01578        22.537
## scATAC_PBMC_R1#GCAGATTGTACGCAAG-1       3.01410        19.888
## scATAC_PBMC_R1#TTCGTTACATTGAACC-1       3.01410        30.000
## scATAC_PBMC_R1#CGCTATCGTGAGGTCA-1       3.00087        21.287
class(QC_df)
## [1] "DFrame"
## attr(,"package")
## [1] "S4Vectors"

Data QC

ggPoint(
    x = QC_df[,1], 
    y = QC_df[,2], 
    colorDensity = TRUE,
    continuousSet = "sambaNight",
    xlabel = "Log10 Unique Fragments",
    ylabel = "TSS Enrichment",
    xlim = c(log10(500), quantile(QC_df[,1], probs = 0.99)),
    ylim = c(0, quantile(QC_df[,2], probs = 0.99))
) + geom_hline(yintercept = 4, lty = "dashed") + geom_vline(xintercept = 3, lty = "dashed")

QC for each sample

p1=plotGroups(
    ArchRProj = projHeme1, 
    groupBy = "Sample", 
    colorBy = "cellColData", 
    name = "TSSEnrichment",
    plotAs = "violin",
    alpha = 0.4,
    addBoxPlot = TRUE
   )
## 1
p2=plotGroups(
    ArchRProj = projHeme1, 
    groupBy = "Sample", 
    colorBy = "cellColData", 
    name = "log10(nFrags)",
    plotAs = "violin",
    alpha = 0.4,
    addBoxPlot = TRUE
   )
## 1
p1+p2

p1 <- plotFragmentSizes(ArchRProj = projHeme1)
## ArchR logging to : ArchRLogs/ArchR-plotFragmentSizes-246020f871fd-Date-2023-01-15_Time-20-54-16.log
## If there is an issue, please report to github with logFile!
## ArchR logging successful to : ArchRLogs/ArchR-plotFragmentSizes-246020f871fd-Date-2023-01-15_Time-20-54-16.log
p2 <- plotTSSEnrichment(ArchRProj = projHeme1)
## ArchR logging to : ArchRLogs/ArchR-plotTSSEnrichment-24601703a9f4-Date-2023-01-15_Time-20-54-31.log
## If there is an issue, please report to github with logFile!
## ArchR logging successful to : ArchRLogs/ArchR-plotTSSEnrichment-24601703a9f4-Date-2023-01-15_Time-20-54-31.log
p1+p2

#saveArchRProject(ArchRProj = projHeme1, outputDirectory = "Save-ProjHeme1", load = FALSE)

Filter Doublets

projHeme2 <- filterDoublets(projHeme1)
## Filtering 410 cells from ArchRProject!
##  scATAC_BMMC_R1 : 243 of 4932 (4.9%)
##  scATAC_CD34_BMMC_R1 : 107 of 3275 (3.3%)
##  scATAC_PBMC_R1 : 60 of 2453 (2.4%)

Dimension reduction

projHeme2 <- addIterativeLSI(
    ArchRProj = projHeme2,
    useMatrix = "TileMatrix", 
    name = "IterativeLSI", 
    iterations = 2, 
    clusterParams = list( #See Seurat::FindClusters
        resolution = c(0.2), 
        sampleCells = 10000, 
        n.start = 10
    ), 
    varFeatures = 25000, 
    dimsToUse = 1:30
)
## Checking Inputs...
## ArchR logging to : ArchRLogs/ArchR-addIterativeLSI-24606ea38868-Date-2023-01-15_Time-20-54-57.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:54:57 : Computing Total Across All Features, 0.001 mins elapsed.
## 2023-01-15 20:54:59 : Computing Top Features, 0.023 mins elapsed.
## ###########
## 2023-01-15 20:55:00 : Running LSI (1 of 2) on Top Features, 0.042 mins elapsed.
## ###########
## 2023-01-15 20:55:00 : Sampling Cells (N = 10001) for Estimated LSI, 0.043 mins elapsed.
## 2023-01-15 20:55:00 : Creating Sampled Partial Matrix, 0.043 mins elapsed.
## 2023-01-15 20:55:05 : Computing Estimated LSI (projectAll = FALSE), 0.133 mins elapsed.
## 2023-01-15 20:55:32 : Identifying Clusters, 0.585 mins elapsed.
## 2023-01-15 20:55:45 : Identified 6 Clusters, 0.788 mins elapsed.
## 2023-01-15 20:55:45 : Saving LSI Iteration, 0.788 mins elapsed.
## 2023-01-15 20:55:56 : Creating Cluster Matrix on the total Group Features, 0.986 mins elapsed.
## 2023-01-15 20:56:08 : Computing Variable Features, 1.186 mins elapsed.
## ###########
## 2023-01-15 20:56:09 : Running LSI (2 of 2) on Variable Features, 1.187 mins elapsed.
## ###########
## 2023-01-15 20:56:09 : Creating Partial Matrix, 1.188 mins elapsed.
## 2023-01-15 20:56:14 : Computing LSI, 1.276 mins elapsed.
## 2023-01-15 20:56:47 : Finished Running IterativeLSI, 1.821 mins elapsed.
projHeme2 <- addHarmony(
    ArchRProj = projHeme2,
    reducedDims = "IterativeLSI",
    name = "Harmony",
    groupBy = "Sample"
)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations

Clustering

projHeme2 <- addClusters(
    input = projHeme2,
    reducedDims = "IterativeLSI",
    method = "Seurat",
    name = "Clusters",
    resolution = 0.8
)
## ArchR logging to : ArchRLogs/ArchR-addClusters-24605c94d3dc-Date-2023-01-15_Time-20-56-54.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:56:54 : Running Seurats FindClusters (Stuart et al. Cell 2019), 0.001 mins elapsed.
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10250
## Number of edges: 456227
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8670
## Number of communities: 12
## Elapsed time: 0 seconds
## 2023-01-15 20:57:06 : Testing Outlier Clusters, 0.2 mins elapsed.
## 2023-01-15 20:57:06 : Assigning Cluster Names to 12 Clusters, 0.2 mins elapsed.
## 2023-01-15 20:57:06 : Finished addClusters, 0.201 mins elapsed.
table(projHeme2$Clusters)
## 
##   C1  C10  C11  C12   C2   C3   C4   C5   C6   C7   C8   C9 
## 1454  357  307  385 1116  899 1140 1389 1242  815  726  420
projHeme2 <- addClusters(
    input = projHeme2,
    reducedDims = "Harmony",
    method = "Seurat",
    name = "HarmonyClusters",
    resolution = 0.8
)
## ArchR logging to : ArchRLogs/ArchR-addClusters-246030b7de2d-Date-2023-01-15_Time-20-57-06.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:57:06 : Running Seurats FindClusters (Stuart et al. Cell 2019), 0.004 mins elapsed.
## Computing nearest neighbor graph
## Computing SNN
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10250
## Number of edges: 431341
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8587
## Number of communities: 13
## Elapsed time: 0 seconds
## 2023-01-15 20:57:18 : Testing Outlier Clusters, 0.196 mins elapsed.
## 2023-01-15 20:57:18 : Assigning Cluster Names to 13 Clusters, 0.196 mins elapsed.
## 2023-01-15 20:57:18 : Finished addClusters, 0.197 mins elapsed.
table(projHeme2$HarmonyClusters)
## 
##   C1  C10  C11  C12  C13   C2   C3   C4   C5   C6   C7   C8   C9 
##  678  145  448  387  305  991 1242  564 2473 1061 1278  536  142

Embeddings

projHeme2 <- addUMAP(
    ArchRProj = projHeme2, 
    reducedDims = "IterativeLSI", 
    name = "UMAP", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine"
)
## 20:57:18 UMAP embedding parameters a = 0.583 b = 1.334
## 20:57:18 Read 10250 rows and found 30 numeric columns
## 20:57:18 Using Annoy for neighbor search, n_neighbors = 30
## 20:57:18 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:57:19 Writing NN index file to temp file /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/file24602c1c5f70
## 20:57:19 Searching Annoy index using 8 threads, search_k = 3000
## 20:57:19 Annoy recall = 100%
## 20:57:20 Commencing smooth kNN distance calibration using 8 threads with target n_neighbors = 30
## 20:57:21 Initializing from normalized Laplacian + noise (using irlba)
## 20:57:21 Commencing optimization for 200 epochs, with 467090 positive edges
## 20:57:26 Optimization finished
## 20:57:26 Creating temp model dir /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246041c2981d
## 20:57:26 Creating dir /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246041c2981d
## 20:57:27 Changing to /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246041c2981d
## 20:57:27 Creating /Users/nzhang/Desktop/ArchR/HemeOutput/Embeddings/Save-Uwot-UMAP-Params-IterativeLSI-24607abdc8af-Date-2023-01-15_Time-20-57-26.tar
## Warning: invalid uid value replaced by that for user 'nobody'
## Warning: invalid gid value replaced by that for user 'nobody'
projHeme2 <- addUMAP(
    ArchRProj = projHeme2, 
    reducedDims = "Harmony", 
    name = "UMAPHarmony", 
    nNeighbors = 30, 
    minDist = 0.5, 
    metric = "cosine"
)
## 20:57:27 UMAP embedding parameters a = 0.583 b = 1.334
## 20:57:27 Read 10250 rows and found 30 numeric columns
## 20:57:27 Using Annoy for neighbor search, n_neighbors = 30
## 20:57:27 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 20:57:28 Writing NN index file to temp file /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/file24607f8c27ee
## 20:57:28 Searching Annoy index using 8 threads, search_k = 3000
## 20:57:28 Annoy recall = 100%
## 20:57:29 Commencing smooth kNN distance calibration using 8 threads with target n_neighbors = 30
## 20:57:30 Initializing from normalized Laplacian + noise (using irlba)
## 20:57:30 Commencing optimization for 200 epochs, with 468468 positive edges
## 20:57:35 Optimization finished
## 20:57:35 Creating temp model dir /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246025b8811a
## 20:57:35 Creating dir /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246025b8811a
## 20:57:36 Changing to /var/folders/2_/g44wy4692nx5kdrctq7f1vk899whll/T//RtmpaItRA1/dir246025b8811a
## 20:57:36 Creating /Users/nzhang/Desktop/ArchR/HemeOutput/Embeddings/Save-Uwot-UMAP-Params-Harmony-24604a91bbad-Date-2023-01-15_Time-20-57-35.tar
## Warning: invalid uid value replaced by that for user 'nobody'
## Warning: invalid gid value replaced by that for user 'nobody'
p1 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "Sample", embedding = "UMAP")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-246060a39f73-Date-2023-01-15_Time-20-57-36.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-246060a39f73-Date-2023-01-15_Time-20-57-36.log
p2 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "Clusters", embedding = "UMAP")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-246016396896-Date-2023-01-15_Time-20-57-37.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-246016396896-Date-2023-01-15_Time-20-57-37.log
p1+p2

p3 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "Sample", embedding = "UMAPHarmony")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24603e0dd07f-Date-2023-01-15_Time-20-57-39.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24603e0dd07f-Date-2023-01-15_Time-20-57-39.log
p4 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "Clusters", embedding = "UMAPHarmony")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24607cf361ac-Date-2023-01-15_Time-20-57-39.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24607cf361ac-Date-2023-01-15_Time-20-57-39.log
p3+p4

p5 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "Sample", embedding = "UMAPHarmony")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24603b777792-Date-2023-01-15_Time-20-57-42.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24603b777792-Date-2023-01-15_Time-20-57-42.log
p6 <- plotEmbedding(ArchRProj = projHeme2, colorBy = "cellColData", name = "HarmonyClusters", embedding = "UMAPHarmony")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-2460204b30be-Date-2023-01-15_Time-20-57-42.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-2460204b30be-Date-2023-01-15_Time-20-57-42.log
p5+p6

# Identify marker genes

markersGS <- getMarkerFeatures(
    ArchRProj = projHeme2, 
    useMatrix = "GeneScoreMatrix", 
    groupBy = "Clusters",
    bias = c("TSSEnrichment", "log10(nFrags)"),
    testMethod = "wilcoxon"
)
## ArchR logging to : ArchRLogs/ArchR-getMarkerFeatures-246012e7618a-Date-2023-01-15_Time-20-57-45.log
## If there is an issue, please report to github with logFile!
## MatrixClass = Sparse.Double.Matrix
## 2023-01-15 20:57:45 : Matching Known Biases, 0.001 mins elapsed.
## ###########
## 2023-01-15 20:58:52 : Completed Pairwise Tests, 1.122 mins elapsed.
## ###########
## ArchR logging successful to : ArchRLogs/ArchR-getMarkerFeatures-246012e7618a-Date-2023-01-15_Time-20-57-45.log
markerList <- getMarkers(markersGS, cutOff = "FDR <= 0.01 & Log2FC >= 1.25")
markerList$C12
## DataFrame with 288 rows and 9 columns
##       seqnames     start       end  strand    name     idx    Log2FC
##          <Rle>   <array>   <array> <array> <array> <array> <numeric>
## 8362     chr16  49891830  49524515       2  ZNF423     539   1.34005
## 7024     chr14 105781914 105675623       2    BRF1     763   1.44923
## 7025     chr14 105714879 105717430       1   BTBD6     764   1.89012
## 16344     chr4   5894785   5822491       2   CRMP1      82   2.74747
## 8726     chr16  89043504  88941263       2 CBFA2T3     903   1.52691
## ...        ...       ...       ...     ...     ...     ...       ...
## 3106     chr10 123748689 124014057       1   TACC2     793   1.67167
## 14586    chr22  24032961  24041363       1    RGL4     128   1.96395
## 2053      chr1 217311097 216676588       2   ESRRG    2053   1.52471
## 11662    chr19  50004125  50004042       2  MIR150    1210   1.44071
## 7297     chr15  43882451  43825660       2 PPIP5K1     260   3.17624
##               FDR  MeanDiff
##         <numeric> <numeric>
## 8362  1.16473e-17  1.674123
## 7024  1.88240e-17  2.029502
## 7025  7.70920e-15  2.146752
## 16344 7.70920e-15  0.936475
## 8726  2.98559e-14  1.916347
## ...           ...       ...
## 3106   0.00892412 0.2822542
## 14586  0.00945525 0.7592742
## 2053   0.00948083 0.3883958
## 11662  0.00975978 1.2129899
## 7297   0.00977352 0.0806601
markerGenes  <- c(
    "CD34", #Early Progenitor
    "GATA1", #Erythroid
    "PAX5", "MS4A1", "EBF1", "MME", #B-Cell Trajectory
    "CD14", "CEBPB", "MPO", #Monocytes
    "IRF8", 
    "CD3D", "CD8A", "TBX21", "IL7R" #TCells
  )

heatmapGS <- markerHeatmap(
  seMarker = markersGS, 
  cutOff = "FDR <= 0.01 & Log2FC >= 1.25", 
  labelMarkers = markerGenes,
  transpose = TRUE
)
## Warning: 'markerHeatmap' is deprecated.
## Use 'plotMarkerHeatmap' instead.
## See help("Deprecated")
## ArchR logging to : ArchRLogs/ArchR-plotMarkerHeatmap-246035c60dd2-Date-2023-01-15_Time-20-58-53.log
## If there is an issue, please report to github with logFile!
## Printing Top Marker Genes:
## C1:
##  RBP7, PADI6, CDA, LINC01226, WLS, SRGAP2D, RPTN, MNDA, FCGR2C, FCGR2B, RNASEL, FCAMR, MIR604, UBE2D1, MIR4679-2
## C2:
##  PERM1, ARHGEF10L, FGR, PPIEL, MYCL, TMCO2, BTBD19, TCTEX1D4, SLC5A9, L1TD1, RPE65, GPR88, MYBPHL, GNAT2, CASQ2
## C3:
##  LINC01777, AJAP1, NPPA-AS1, CLCNKB, KLHDC7A, PAX7, RNF186, UBXN10, FAM43B, ALPL, WNT4, EPHA8, MIR378F, MYOM3, TRIM63
## C4:
##  ANGPTL1, DEPP1, RBP3, OPN4, SLIT1-AS1, KRTAP5-9, CCNA1, PCDH8, LINC00446, MIR3529, MPO, LAMA1, BOD1L2, PRTN3, ELANE
## C5:
##  LINC02593, PRDM16, MFAP2, PADI3, HSPG2, MIR4253, CATSPER4, C1orf94, GJB5, ELAVL4, GLIS1, ACOT11, SGIP1, ST6GALNAC5, SLC6A17
## C6:
##  SNORA59B, RCAN3AS, RCAN3, ORC1, S1PR1, FNDC7, AMIGO1, CHI3L2, HIST2H2BC, CIART, MIR557, GATA3-AS1, RTKN2, JAKMIP3, PDE3B
## C7:
##  TNFRSF25, HSD17B7, GAS5-AS1, BNIP3, PGGHG, PTPRCAP, SNORA8, RPL13P5, MCRS1, PIP4K2C, SNORD116-3, IPW, EMC4, GCHFR, TELO2
## C8:
##  TNFRSF9, RUNX3, LCK, LEXM, IL12RB2, GBP2, GBP5, LOC729930, VCAM1, CHIAP2, BCL2L15, CD160, PDZK1, CDC42SE1, SH2D2A
## C9:
##  LINC01346, PRDM2, CNR2, SHISAL2A, MFSD14A, SLC16A4, PIFO, VANGL2, SLAMF6, KIAA0040, C1orf220, MIR4424, RALGPS2, RGS1, RGS13
## C10:
##  CALML6, PLCH2, MAD2L2, MDS2, SLFNL1, BTF3L4, ROR1, KLHDC8A, ZNF22, ZNF488, SLC16A9, AP3M1, MIR346, NUDT9P1, PIK3AP1
## C11:
##  ANKRD65, TMEM88B, PLA2G5, PLA2G2D, NCMAP, LIN28A, MAP7D1, SCMH1, TTC39A, GBP1P1, HAO2, NUDT17, RORC, CRNN, LCE5A
## C12:
##  MIR551A, SLC2A7, HCRTR1, CSMD2, TFAP2E, KLF17, AGBL4, ERICH3, LOC729970, NTNG1, UCK2, MYOC, MIR5191, ESRRG, MIR5008
## Identified 2615 markers!
##  [1] "CD34"  "GATA1" "PAX5"  "MS4A1" "EBF1"  "MME"   "CD14"  "CEBPB" "MPO"  
## [10] "IRF8"  "CD3D"  "CD8A"  "TBX21" "IL7R"
## Adding Annotations..
## Preparing Main Heatmap..
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## ArchR logging successful to : ArchRLogs/ArchR-plotMarkerHeatmap-246035c60dd2-Date-2023-01-15_Time-20-58-53.log
ComplexHeatmap::draw(heatmapGS, heatmap_legend_side = "bot", annotation_legend_side = "bot")

markerGenes  <- c(
    "CD34",  #Early Progenitor
    "GATA1", #Erythroid
    "PAX5", "MS4A1", "MME", #B-Cell Trajectory
    "CD14", "MPO", #Monocytes
    "CD3D", "CD8A"#TCells
  )

p <- plotEmbedding(
    ArchRProj = projHeme2, 
    colorBy = "GeneScoreMatrix", 
    name = markerGenes, 
    embedding = "UMAP",
    quantCut = c(0.01, 0.95),
    imputeWeights = NULL
)
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-2460d47e9e3-Date-2023-01-15_Time-20-58-56.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneScoreMatrix
## Getting Matrix Values...
## 2023-01-15 20:58:56 :
## 
## Plotting Embedding
## 1 2 3 4 5 6 7 8 9 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-2460d47e9e3-Date-2023-01-15_Time-20-58-56.log
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

# Impute gene scores

projHeme2 <- addImputeWeights(projHeme2)
## ArchR logging to : ArchRLogs/ArchR-addImputeWeights-24606e67c394-Date-2023-01-15_Time-20-59-05.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:59:05 : Computing Impute Weights Using Magic (Cell 2018), 0 mins elapsed.
p <- plotEmbedding(
    ArchRProj = projHeme2, 
    colorBy = "GeneScoreMatrix", 
    name = markerGenes, 
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme2)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24603e3c6718-Date-2023-01-15_Time-20-59-12.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneScoreMatrix
## Getting Matrix Values...
## 2023-01-15 20:59:13 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 7 8 9 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24603e3c6718-Date-2023-01-15_Time-20-59-12.log
#Rearrange for grid plotting
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

p <- plotBrowserTrack(
    ArchRProj = projHeme2, 
    groupBy = "Clusters", 
    geneSymbol = "CD14", 
    upstream = 50000,
    downstream = 50000
)
## ArchR logging to : ArchRLogs/ArchR-plotBrowserTrack-24607c78341f-Date-2023-01-15_Time-20-59-24.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:59:24 : Validating Region, 0.002 mins elapsed.
## GRanges object with 1 range and 2 metadata columns:
##       seqnames              ranges strand |     gene_id      symbol
##          <Rle>           <IRanges>  <Rle> | <character> <character>
##   [1]     chr5 140011313-140013286      - |         929        CD14
##   -------
##   seqinfo: 24 sequences from hg19 genome
## 2023-01-15 20:59:24 : Adding Bulk Tracks (1 of 1), 0.003 mins elapsed.
## 2023-01-15 20:59:25 : Adding Gene Tracks (1 of 1), 0.015 mins elapsed.
## 2023-01-15 20:59:25 : Plotting, 0.02 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-plotBrowserTrack-24607c78341f-Date-2023-01-15_Time-20-59-24.log
grid::grid.newpage()
grid::grid.draw(p$CD14)

#saveArchRProject(ArchRProj = projHeme2, outputDirectory = "Save-ProjHeme2", load = FALSE)

scRNAseq integration

seRNA <- readRDS("scRNA-Hematopoiesis-Granja-2019.rds")

Unconstrained Integration

projHeme2 <- addGeneIntegrationMatrix(
    ArchRProj = projHeme2, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix",
    reducedDims = "IterativeLSI",
    seRNA = seRNA,
    addToArrow = FALSE,
    groupRNA = "BioClassification",
    nameCell = "predictedCell_Un",
    nameGroup = "predictedGroup_Un",
    nameScore = "predictedScore_Un"
)
## ArchR logging to : ArchRLogs/ArchR-addGeneIntegrationMatrix-2460272b8682-Date-2023-01-15_Time-20-59-30.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 20:59:30 : Running Seurat's Integration Stuart* et al 2019, 0.001 mins elapsed.
## 2023-01-15 20:59:30 : Checking ATAC Input, 0.005 mins elapsed.
## 2023-01-15 20:59:30 : Checking RNA Input, 0.008 mins elapsed.
## 2023-01-15 20:59:36 : Found 18601 overlapping gene names from gene scores and rna matrix!, 0.113 mins elapsed.
## 2023-01-15 20:59:36 : Creating Integration Blocks, 0.113 mins elapsed.
## 2023-01-15 20:59:37 : Prepping Interation Data, 0.117 mins elapsed.
## 2023-01-15 20:59:37 : Computing Integration in 1 Integration Blocks!, 0 mins elapsed.
## 2023-01-15 20:59:37 : Block (1 of 1) : Computing Integration, 0 mins elapsed.
## 2023-01-15 20:59:39 : Block (1 of 1) : Identifying Variable Genes, 0.038 mins elapsed.
## 2023-01-15 20:59:43 : Block (1 of 1) : Getting GeneScoreMatrix, 0.092 mins elapsed.
## 2023-01-15 20:59:49 : Block (1 of 1) : Imputing GeneScoreMatrix, 0.193 mins elapsed.
## Getting ImputeWeights
## 2023-01-15 21:00:19 : Block (1 of 1) : Seurat FindTransferAnchors, 0.698 mins elapsed.
## 2023-01-15 21:05:57 : Block (1 of 1) : Seurat TransferData Cell Group Labels, 6.325 mins elapsed.
## 2023-01-15 21:05:58 : Block (1 of 1) : Seurat TransferData Cell Names Labels, 6.354 mins elapsed.
## 2023-01-15 21:06:19 : Block (1 of 1) : Saving TransferAnchors Joint CCA, 6.697 mins elapsed.
## 2023-01-15 21:06:20 : Block (1 of 1) : Completed Integration, 6.718 mins elapsed.
## 2023-01-15 21:06:21 : Block (1 of 1) : Plotting Joint UMAP, 6.729 mins elapsed.
## Length of unique values greater than palette, interpolating..
## 2023-01-15 21:06:47 : Completed Integration with RNA Matrix, 7.163 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addGeneIntegrationMatrix-2460272b8682-Date-2023-01-15_Time-20-59-30.log
head(projHeme2$predictedGroup_Un)
## [1] "08_GMP.Neut" "22_CD4.M"    "16_Pre.B"    "06_CLP.1"    "16_Pre.B"   
## [6] "08_GMP.Neut"

Constrained Integration

# The predicted cell type for each scATAC-seq cluster
cM <- as.matrix(confusionMatrix(projHeme2$Clusters, projHeme2$predictedGroup_Un))
preClust <- colnames(cM)[apply(cM, 1 , which.max)]
cbind(preClust, rownames(cM)) #Assignments
##       preClust              
##  [1,] "17_B"           "C9" 
##  [2,] "20_CD4.N1"      "C6" 
##  [3,] "16_Pre.B"       "C10"
##  [4,] "08_GMP.Neut"    "C4" 
##  [5,] "11_CD14.Mono.1" "C1" 
##  [6,] "03_Late.Eryth"  "C3" 
##  [7,] "22_CD4.M"       "C7" 
##  [8,] "01_HSC"         "C5" 
##  [9,] "09_pDC"         "C11"
## [10,] "25_NK"          "C8" 
## [11,] "12_CD14.Mono.2" "C2" 
## [12,] "15_CLP.2"       "C12"
table(seRNA@colData$BioClassification)
## 
##         01_HSC 02_Early.Eryth  03_Late.Eryth  04_Early.Baso    05_CMP.LMPP 
##           1425           1653            446            111           2260 
##       06_CLP.1         07_GMP    08_GMP.Neut         09_pDC         10_cDC 
##            903           2097           1050            544            325 
## 11_CD14.Mono.1 12_CD14.Mono.2   13_CD16.Mono         14_Unk       15_CLP.2 
##           1800           4222            292            520            377 
##       16_Pre.B           17_B      18_Plasma       19_CD8.N      20_CD4.N1 
##            710           1711             62           1521           2470 
##      21_CD4.N2       22_CD4.M      23_CD8.EM      24_CD8.CM          25_NK 
##           2364           3539            796           2080           2143 
##         26_Unk 
##            161
#scRNA-seq 19-25 are T or NK cells
cTNK <- paste0(paste0(19:25), collapse="|")
cTNK
## [1] "19|20|21|22|23|24|25"
#scRNA-seq other cells
cNonTNK <- paste0(c(paste0("0", 1:9), 10:13, 15:18), collapse="|")
cNonTNK
## [1] "01|02|03|04|05|06|07|08|09|10|11|12|13|15|16|17|18"
#scATAC T or NK clusters
clustTNK <- rownames(cM)[grep(cTNK, preClust)]
clustTNK
## [1] "C6" "C7" "C8"
#scATAC none T or NK clusters
clustNonTNK <- rownames(cM)[grep(cNonTNK, preClust)]
clustNonTNK
## [1] "C9"  "C10" "C4"  "C1"  "C3"  "C5"  "C11" "C2"  "C12"
#scRNA T NK cell names
rnaTNK <- colnames(seRNA)[grep(cTNK, colData(seRNA)$BioClassification)]
head(rnaTNK)
## [1] "PBMC_10x_GREENLEAF_REP1:AAACCCAGTCGTCATA-1"
## [2] "PBMC_10x_GREENLEAF_REP1:AAACCCATCCGATGTA-1"
## [3] "PBMC_10x_GREENLEAF_REP1:AAACCCATCTCAACGA-1"
## [4] "PBMC_10x_GREENLEAF_REP1:AAACCCATCTCTCGAC-1"
## [5] "PBMC_10x_GREENLEAF_REP1:AAACGAACAATCGTCA-1"
## [6] "PBMC_10x_GREENLEAF_REP1:AAACGAACACGATTCA-1"
rnaNonTNK <- colnames(seRNA)[grep(cNonTNK, colData(seRNA)$BioClassification)]
head(rnaNonTNK)
## [1] "CD34_32_R5:AAACCTGAGTATCGAA-1" "CD34_32_R5:AAACCTGAGTCGTTTG-1"
## [3] "CD34_32_R5:AAACCTGGTTCCACAA-1" "CD34_32_R5:AAACGGGAGCTTCGCG-1"
## [5] "CD34_32_R5:AAACGGGAGGGAGTAA-1" "CD34_32_R5:AAACGGGAGTTACGGG-1"
groupList <- SimpleList(
    TNK = SimpleList(
        ATAC = projHeme2$cellNames[projHeme2$Clusters %in% clustTNK],
        RNA = rnaTNK
    ),
    NonTNK = SimpleList(
        ATAC = projHeme2$cellNames[projHeme2$Clusters %in% clustNonTNK],
        RNA = rnaNonTNK
    )    
)
projHeme2 <- addGeneIntegrationMatrix(
    ArchRProj = projHeme2, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix",
    reducedDims = "IterativeLSI",
    seRNA = seRNA,
    addToArrow = FALSE, 
    groupList = groupList,
    groupRNA = "BioClassification",
    nameCell = "predictedCell_Co",
    nameGroup = "predictedGroup_Co",
    nameScore = "predictedScore_Co"
)
## ArchR logging to : ArchRLogs/ArchR-addGeneIntegrationMatrix-24605ff5ff5c-Date-2023-01-15_Time-21-06-47.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:06:47 : Running Seurat's Integration Stuart* et al 2019, 0.001 mins elapsed.
## 2023-01-15 21:06:48 : Checking ATAC Input, 0.005 mins elapsed.
## 2023-01-15 21:06:48 : Checking RNA Input, 0.008 mins elapsed.
## 2023-01-15 21:06:54 : Found 18601 overlapping gene names from gene scores and rna matrix!, 0.105 mins elapsed.
## 2023-01-15 21:06:54 : Creating Integration Blocks, 0.105 mins elapsed.
## 2023-01-15 21:06:54 : Prepping Interation Data, 0.108 mins elapsed.
## 2023-01-15 21:06:54 : Computing Integration in 2 Integration Blocks!, 0 mins elapsed.
## 2023-01-15 21:12:15 : Block (1 of 2) : Plotting Joint UMAP, 5.339 mins elapsed.
## 2023-01-15 21:12:32 : Block (2 of 2) : Plotting Joint UMAP, 5.627 mins elapsed.
## 2023-01-15 21:12:55 : Completed Integration with RNA Matrix, 6.013 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addGeneIntegrationMatrix-24605ff5ff5c-Date-2023-01-15_Time-21-06-47.log
pal <- paletteDiscrete(values = colData(seRNA)$BioClassification)
## Length of unique values greater than palette, interpolating..
pal
##         01_HSC 02_Early.Eryth  03_Late.Eryth  04_Early.Baso    05_CMP.LMPP 
##      "#D51F26"      "#502A59"      "#235D55"      "#3D6E57"      "#8D2B8B" 
##       06_CLP.1         07_GMP    08_GMP.Neut         09_pDC         10_cDC 
##      "#DE6C3E"      "#F9B712"      "#D8CE42"      "#8E9ACD"      "#B774B1" 
## 11_CD14.Mono.1 12_CD14.Mono.2   13_CD16.Mono         14_Unk       15_CLP.2 
##      "#D69FC8"      "#C7C8DE"      "#8FD3D4"      "#89C86E"      "#CC9672" 
##       16_Pre.B           17_B      18_Plasma       19_CD8.N      20_CD4.N1 
##      "#CF7E96"      "#A27AA4"      "#CD4F32"      "#6B977E"      "#518AA3" 
##      21_CD4.N2       22_CD4.M      23_CD8.EM      24_CD8.CM          25_NK 
##      "#5A5297"      "#0F707D"      "#5E2E32"      "#A95A3C"      "#B28D5C" 
##         26_Unk 
##      "#3D3D3D"
p1 <- plotEmbedding(
    projHeme2, 
    colorBy = "cellColData", 
    name = "predictedGroup_Un", 
    pal = pal
)
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-2460699ce87c-Date-2023-01-15_Time-21-12-55.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-2460699ce87c-Date-2023-01-15_Time-21-12-55.log
p2 <- plotEmbedding(
    projHeme2, 
    colorBy = "cellColData", 
    name = "predictedGroup_Co", 
    pal = pal
)
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24603c635b0f-Date-2023-01-15_Time-21-12-56.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24603c635b0f-Date-2023-01-15_Time-21-12-56.log
p1+p2

projHeme3 <- addGeneIntegrationMatrix(
    ArchRProj = projHeme2, 
    useMatrix = "GeneScoreMatrix",
    matrixName = "GeneIntegrationMatrix",
    reducedDims = "IterativeLSI",
    seRNA = seRNA,
    addToArrow = TRUE,
    force= TRUE,
    groupList = groupList,
    groupRNA = "BioClassification",
    nameCell = "predictedCell",
    nameGroup = "predictedGroup",
    nameScore = "predictedScore"
)
## ArchR logging to : ArchRLogs/ArchR-addGeneIntegrationMatrix-246072a59c6b-Date-2023-01-15_Time-21-13-00.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:13:00 : Running Seurat's Integration Stuart* et al 2019, 0.001 mins elapsed.
## 2023-01-15 21:13:00 : Checking ATAC Input, 0.005 mins elapsed.
## 2023-01-15 21:13:00 : Checking RNA Input, 0.007 mins elapsed.
## 2023-01-15 21:13:07 : Found 18601 overlapping gene names from gene scores and rna matrix!, 0.122 mins elapsed.
## 2023-01-15 21:13:07 : Creating Integration Blocks, 0.122 mins elapsed.
## 2023-01-15 21:13:07 : Prepping Interation Data, 0.124 mins elapsed.
## 2023-01-15 21:13:08 : Computing Integration in 2 Integration Blocks!, 0 mins elapsed.
## 2023-01-15 21:18:41 : Block (1 of 2) : Plotting Joint UMAP, 5.56 mins elapsed.
## 2023-01-15 21:18:59 : Block (2 of 2) : Plotting Joint UMAP, 5.849 mins elapsed.
## 2023-01-15 21:19:21 : Transferring Data to ArrowFiles, 6.226 mins elapsed.
## 2023-01-15 21:20:04 : Completed Integration with RNA Matrix, 6.93 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addGeneIntegrationMatrix-246072a59c6b-Date-2023-01-15_Time-21-13-00.log
getAvailableMatrices(projHeme3)
## [1] "GeneIntegrationMatrix" "GeneScoreMatrix"       "TileMatrix"
projHeme3 <- addImputeWeights(projHeme3)
## ArchR logging to : ArchRLogs/ArchR-addImputeWeights-24606ef697a5-Date-2023-01-15_Time-21-20-04.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:20:04 : Computing Impute Weights Using Magic (Cell 2018), 0 mins elapsed.
markerGenes  <- c(
    "CD34", #Early Progenitor
    "GATA1", #Erythroid
    "PAX5", "MS4A1", #B-Cell Trajectory
    "CD14", #Monocytes
    "CD3D", "CD8A", "TBX21", "IL7R" #TCells
  )

p1 <- plotEmbedding(
    ArchRProj = projHeme3, 
    colorBy = "GeneIntegrationMatrix", 
    name = markerGenes, 
    continuousSet = "horizonExtra",
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme3)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24607706aea7-Date-2023-01-15_Time-21-20-12.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneIntegrationMatrix
## Getting Matrix Values...
## 2023-01-15 21:20:12 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 7 8 9 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24607706aea7-Date-2023-01-15_Time-21-20-12.log
p2 <- plotEmbedding(
    ArchRProj = projHeme3, 
    colorBy = "GeneScoreMatrix", 
    continuousSet = "horizonExtra",
    name = markerGenes, 
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme3)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-246057b492fd-Date-2023-01-15_Time-21-20-19.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneScoreMatrix
## Getting Matrix Values...
## 2023-01-15 21:20:19 : 
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 7 8 9 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-246057b492fd-Date-2023-01-15_Time-21-20-19.log
p1c <- lapply(p1, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})

p2c <- lapply(p2, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})

# Gene expression deduced by scRNA-seq
do.call(cowplot::plot_grid, c(list(ncol = 3), p1c))

#Gene score
do.call(cowplot::plot_grid, c(list(ncol = 3), p2c))

cM <- confusionMatrix(projHeme3$Clusters, projHeme3$predictedGroup)
labelOld <- rownames(cM)
labelOld
##  [1] "C9"  "C6"  "C10" "C4"  "C1"  "C3"  "C7"  "C5"  "C11" "C8"  "C2"  "C12"
labelNew <- colnames(cM)[apply(cM, 1, which.max)]
labelNew
##  [1] "17_B"           "20_CD4.N1"      "16_Pre.B"       "08_GMP.Neut"   
##  [5] "11_CD14.Mono.1" "03_Late.Eryth"  "22_CD4.M"       "01_HSC"        
##  [9] "09_pDC"         "25_NK"          "12_CD14.Mono.2" "15_CLP.2"
remapClust <- c(
    "01_HSC" = "Progenitor",
    "02_Early.Eryth" = "Erythroid",
    "03_Late.Eryth" = "Erythroid",
    "04_Early.Baso" = "Basophil",
    "05_CMP.LMPP" = "Progenitor",
    "06_CLP.1" = "CLP",
    "07_GMP" = "GMP",
    "08_GMP.Neut" = "GMP",
    "09_pDC" = "pDC",
    "10_cDC" = "cDC",
    "11_CD14.Mono.1" = "Mono",
    "12_CD14.Mono.2" = "Mono",
    "13_CD16.Mono" = "Mono",
    "15_CLP.2" = "CLP",
    "16_Pre.B" = "PreB",
    "17_B" = "B",
    "18_Plasma" = "Plasma",
    "19_CD8.N" = "CD8.N",
    "20_CD4.N1" = "CD4.N",
    "21_CD4.N2" = "CD4.N",
    "22_CD4.M" = "CD4.M",
    "23_CD8.EM" = "CD8.EM",
    "24_CD8.CM" = "CD8.CM",
    "25_NK" = "NK"
)
remapClust <- remapClust[names(remapClust) %in% labelNew]
labelNew2 <- mapLabels(labelNew, oldLabels = names(remapClust), newLabels = remapClust)
labelNew2
##  [1] "B"          "CD4.N"      "PreB"       "GMP"        "Mono"      
##  [6] "Erythroid"  "CD4.M"      "Progenitor" "pDC"        "NK"        
## [11] "Mono"       "CLP"
projHeme3$Clusters2 <- mapLabels(projHeme3$Clusters, newLabels = labelNew2, oldLabels = labelOld)
plotEmbedding(projHeme3, colorBy = "cellColData", name = "Clusters2")
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24605af9df2a-Date-2023-01-15_Time-21-20-34.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = cellColData
## Plotting Embedding
## 1 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24605af9df2a-Date-2023-01-15_Time-21-20-34.log

#saveArchRProject(ArchRProj = projHeme3, outputDirectory = "Save-ProjHeme3", load = FALSE)

Pseudo-bulk replicates

library(BSgenome.Hsapiens.UCSC.hg19, lib.loc = "/Library/Frameworks/R.framework/Versions/4.2/Resources/library")
projHeme4 <- addGroupCoverages(ArchRProj = projHeme3, groupBy = "Clusters2")
## ArchR logging to : ArchRLogs/ArchR-addGroupCoverages-2460b825611-Date-2023-01-15_Time-21-20-35.log
## If there is an issue, please report to github with logFile!
## B (1 of 11) : CellGroups N = 2
## CD4.M (2 of 11) : CellGroups N = 2
## CD4.N (3 of 11) : CellGroups N = 2
## CLP (4 of 11) : CellGroups N = 2
## Erythroid (5 of 11) : CellGroups N = 2
## GMP (6 of 11) : CellGroups N = 2
## Mono (7 of 11) : CellGroups N = 2
## NK (8 of 11) : CellGroups N = 2
## pDC (9 of 11) : CellGroups N = 2
## PreB (10 of 11) : CellGroups N = 2
## Progenitor (11 of 11) : CellGroups N = 2
## 2023-01-15 21:20:37 : Creating Coverage Files!, 0.024 mins elapsed.
## 2023-01-15 21:20:37 : Batch Execution w/ safelapply!, 0.024 mins elapsed.
## 2023-01-15 21:22:27 : Adding Kmer Bias to Coverage Files!, 1.865 mins elapsed.
## Completed Kmer Bias Calculation
## Adding Kmer Bias (1 of 22)
## Adding Kmer Bias (2 of 22)
## Adding Kmer Bias (3 of 22)
## Adding Kmer Bias (4 of 22)
## Adding Kmer Bias (5 of 22)
## Adding Kmer Bias (6 of 22)
## Adding Kmer Bias (7 of 22)
## Adding Kmer Bias (8 of 22)
## Adding Kmer Bias (9 of 22)
## Adding Kmer Bias (10 of 22)
## Adding Kmer Bias (11 of 22)
## Adding Kmer Bias (12 of 22)
## Adding Kmer Bias (13 of 22)
## Adding Kmer Bias (14 of 22)
## Adding Kmer Bias (15 of 22)
## Adding Kmer Bias (16 of 22)
## Adding Kmer Bias (17 of 22)
## Adding Kmer Bias (18 of 22)
## Adding Kmer Bias (19 of 22)
## Adding Kmer Bias (20 of 22)
## Adding Kmer Bias (21 of 22)
## Adding Kmer Bias (22 of 22)
## 2023-01-15 21:24:20 : Finished Creation of Coverage Files!, 3.742 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addGroupCoverages-2460b825611-Date-2023-01-15_Time-21-20-35.log

Peak calling

projHeme4 <- addReproduciblePeakSet(
    ArchRProj = projHeme4, 
    groupBy = "Clusters2", 
    pathToMacs2 = "/Users/nzhang/opt/anaconda3/envs/py37/bin/macs2"
)
## ArchR logging to : ArchRLogs/ArchR-addReproduciblePeakSet-246060ec6df7-Date-2023-01-15_Time-21-24-20.log
## If there is an issue, please report to github with logFile!
## Calling Peaks with Macs2
## 2023-01-15 21:24:21 : Peak Calling Parameters!, 0.002 mins elapsed.
##                 Group nCells nCellsUsed nReplicates nMin nMax maxPeaks
## B                   B    420        415           2  164  251   150000
## CD4.M           CD4.M    815        613           2  113  500   150000
## CD4.N           CD4.N   1242        549           2   49  500   150000
## CLP               CLP    385        385           2   88  297   150000
## Erythroid   Erythroid    899        729           2  229  500   150000
## GMP               GMP   1140        840           2  340  500   150000
## Mono             Mono   2570       1000           2  500  500   150000
## NK                 NK    726        726           2  304  422   150000
## pDC               pDC    307        297           2  148  149   148500
## PreB             PreB    357        357           2   40  317   150000
## Progenitor Progenitor   1389        636           2  136  500   150000
## 2023-01-15 21:24:21 : Batching Peak Calls!, 0.002 mins elapsed.
## 2023-01-15 21:24:21 : Batch Execution w/ safelapply!, 0 mins elapsed.
## 2023-01-15 21:36:14 : Identifying Reproducible Peaks!, 11.892 mins elapsed.
## 2023-01-15 21:36:24 : Creating Union Peak Set!, 12.057 mins elapsed.
## Converged after 6 iterations!
## Plotting Ggplot!
## 2023-01-15 21:36:30 : Finished Creating Union Peak Set (143229)!, 12.153 mins elapsed.
getPeakSet(projHeme4)
## GRanges object with 143229 ranges and 13 metadata columns:
##             seqnames              ranges strand |     score
##                <Rle>           <IRanges>  <Rle> | <numeric>
##        Mono     chr1       752472-752972      * |  25.48294
##           B     chr1       762688-763188      * |  51.64833
##           B     chr1       801007-801507      * |  15.77857
##           B     chr1       805039-805539      * |  37.51562
##         CLP     chr1       845326-845826      * |   2.76241
##         ...      ...                 ...    ... .       ...
##       CD4.N     chrX 154841576-154842076      * |   3.32381
##          NK     chrX 154842394-154842894      * |  16.09727
##   Erythroid     chrX 154912373-154912873      * |   7.72601
##        PreB     chrX 154996185-154996685      * |   3.17074
##         GMP     chrX 154996992-154997492      * |   3.47992
##             replicateScoreQuantile groupScoreQuantile Reproducibility
##                          <numeric>          <numeric>       <numeric>
##        Mono                  0.858              0.723               2
##           B                  0.947              0.882               2
##           B                  0.690              0.423               2
##           B                  0.955              0.897               2
##         CLP                  0.706              0.312               2
##         ...                    ...                ...             ...
##       CD4.N                  0.658              0.257               2
##          NK                  0.731              0.542               2
##   Erythroid                  0.394              0.116               2
##        PreB                  0.672              0.283               2
##         GMP                  0.385              0.102               2
##                     GroupReplicate distToGeneStart nearestGene    peakType
##                        <character>       <integer> <character> <character>
##        Mono  Mono._.scATAC_BMMC_R1           10179   LINC00115      Distal
##           B     B._.scATAC_PBMC_R1              32   LINC01128    Promoter
##           B     B._.scATAC_PBMC_R1           10924      FAM41C      Distal
##           B     B._.scATAC_BMMC_R1            6892      FAM41C    Intronic
##         CLP   CLP._.scATAC_BMMC_R1            9240   LINC02593      Distal
##         ...                    ...             ...         ...         ...
##       CD4.N CD4.N._.scATAC_PBMC_R1             795       TMLHE    Intronic
##          NK    NK._.scATAC_BMMC_R1              21       TMLHE    Promoter
##   Erythroid Erythroid._.scATAC_C..           70000       TMLHE      Distal
##        PreB            PreB._.Rep2          153812       TMLHE      Distal
##         GMP GMP._.scATAC_CD34_BM..          154619       TMLHE      Distal
##             distToTSS  nearestTSS        GC       idx         N
##             <integer> <character> <numeric> <integer> <numeric>
##        Mono     10179  uc010nxx.2    0.4790         1         0
##           B        32  uc031pjj.1    0.6966         2         0
##           B     10924  uc001abt.4    0.4371         3         0
##           B      6892  uc001abt.4    0.7285         4         0
##         CLP      1238  uc001abu.1    0.7904         5         0
##         ...       ...         ...       ...       ...       ...
##       CD4.N       795  uc004fnn.3    0.4172      3445         0
##          NK        21  uc004fnn.3    0.5988      3446         0
##   Erythroid     70000  uc004fnn.3    0.3653      3447         0
##        PreB      1015  uc004fnq.1    0.4251      3448         0
##         GMP       208  uc004fnq.1    0.5449      3449         0
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
#saveArchRProject(ArchRProj = projHeme4, outputDirectory = "Save-ProjHeme4", load = FALSE)
projHeme5 <- addPeakMatrix(projHeme4)
## ArchR logging to : ArchRLogs/ArchR-addPeakMatrix-24601695a850-Date-2023-01-15_Time-21-36-30.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:36:30 : Batch Execution w/ safelapply!, 0 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addPeakMatrix-24601695a850-Date-2023-01-15_Time-21-36-30.log
getAvailableMatrices(projHeme5)
## [1] "GeneIntegrationMatrix" "GeneScoreMatrix"       "PeakMatrix"           
## [4] "TileMatrix"
markersPeaks <- getMarkerFeatures(
    ArchRProj = projHeme5, 
    useMatrix = "PeakMatrix", 
    groupBy = "Clusters2",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  testMethod = "wilcoxon"
)
## ArchR logging to : ArchRLogs/ArchR-getMarkerFeatures-246014321282-Date-2023-01-15_Time-21-37-20.log
## If there is an issue, please report to github with logFile!
## MatrixClass = Sparse.Integer.Matrix
## 2023-01-15 21:37:20 : Matching Known Biases, 0.003 mins elapsed.
## ###########
## 2023-01-15 21:38:30 : Completed Pairwise Tests, 1.175 mins elapsed.
## ###########
## ArchR logging successful to : ArchRLogs/ArchR-getMarkerFeatures-246014321282-Date-2023-01-15_Time-21-37-20.log
markersPeaks
## class: SummarizedExperiment 
## dim: 143229 11 
## metadata(2): MatchInfo Params
## assays(7): Log2FC Mean ... AUC MeanBGD
## rownames(143229): 1 2 ... 143228 143229
## rowData names(4): seqnames idx start end
## colnames(11): B CD4.M ... PreB Progenitor
## colData names(0):
markerList <- getMarkers(markersPeaks, cutOff = "FDR <= 0.01 & Log2FC >= 1", returnGR = TRUE)
markerList
## List of length 11
## names(11): B CD4.M CD4.N CLP Erythroid GMP Mono NK pDC PreB Progenitor
markerList$B
## GRanges object with 465 ranges and 3 metadata columns:
##         seqnames              ranges strand |    Log2FC         FDR  MeanDiff
##            <Rle>           <IRanges>  <Rle> | <numeric>   <numeric> <numeric>
##     [1]    chr16   88079477-88079977      * |   4.25447 1.57704e-09  0.829693
##     [2]     chr3   13152065-13152565      * |   3.29373 1.57704e-09  1.316865
##     [3]     chr2 232537187-232537687      * |   3.42329 1.88190e-08  0.998006
##     [4]    chr10   49879507-49880007      * |   6.01904 2.87913e-08  0.811835
##     [5]    chr11   60222870-60223370      * |   4.70305 4.07094e-08  0.860517
##     ...      ...                 ...    ... .       ...         ...       ...
##   [461]     chr3 169531535-169532035      * |   2.77078  0.00963295  0.424682
##   [462]     chr8 125649626-125650126      * |   1.42537  0.00963295  0.320802
##   [463]    chr20   43214896-43215396      * |   2.34248  0.00974009  0.471158
##   [464]     chr6   62995933-62996433      * |   2.01060  0.00993924  0.359606
##   [465]     chr9   79087060-79087560      * |   2.46851  0.00997262  0.292655
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
heatmapPeaks <- markerHeatmap(
  seMarker = markersPeaks, 
  cutOff = "FDR <= 0.1 & Log2FC >= 0.5",
  transpose = TRUE
)
## Warning: 'markerHeatmap' is deprecated.
## Use 'plotMarkerHeatmap' instead.
## See help("Deprecated")
## ArchR logging to : ArchRLogs/ArchR-plotMarkerHeatmap-24602a4cc5f8-Date-2023-01-15_Time-21-38-33.log
## If there is an issue, please report to github with logFile!
## Identified 41053 markers!
##   [1] "chr1:801007-801507"     "chr1:968348-968848"     "chr1:1071786-1072286"  
##   [4] "chr1:4067611-4068111"   "chr1:4087539-4088039"   "chr1:4104872-4105372"  
##   [7] "chr1:12212042-12212542" "chr1:15208404-15208904" "chr1:15898189-15898689"
##  [10] "chr1:24054520-24055020" "chr1:24239629-24240129" "chr1:25075151-25075651"
##  [13] "chr1:27070474-27070974" "chr1:27854640-27855140" "chr1:28970219-28970719"
##  [16] "chr1:1003836-1004336"   "chr1:1004512-1005012"   "chr1:1141925-1142425"  
##  [19] "chr1:1152339-1152839"   "chr1:1259847-1260347"   "chr1:1306701-1307201"  
##  [22] "chr1:1307337-1307837"   "chr1:1624353-1624853"   "chr1:1709415-1709915"  
##  [25] "chr1:2125943-2126443"   "chr1:2163278-2163778"   "chr1:2186319-2186819"  
##  [28] "chr1:2246488-2246988"   "chr1:2455275-2455775"   "chr1:2478144-2478644"  
##  [31] "chr1:2084331-2084831"   "chr1:2474849-2475349"   "chr1:8761186-8761686"  
##  [34] "chr1:20960228-20960728" "chr1:22350842-22351342" "chr1:23810953-23811453"
##  [37] "chr1:24194675-24195175" "chr1:24741918-24742418" "chr1:24862906-24863406"
##  [40] "chr1:26644134-26644634" "chr1:27848816-27849316" "chr1:31222573-31223073"
##  [43] "chr1:36772351-36772851" "chr1:39649342-39649842" "chr1:40156880-40157380"
##  [46] "chr1:1114597-1115097"   "chr1:1144252-1144752"   "chr1:1609169-1609669"  
##  [49] "chr1:1765345-1765845"   "chr1:1838613-1839113"   "chr1:5777263-5777763"  
##  [52] "chr1:6640958-6641458"   "chr1:9777631-9778131"   "chr1:11412790-11413290"
##  [55] "chr1:11435901-11436401" "chr1:11467303-11467803" "chr1:13825006-13825506"
##  [58] "chr1:17336172-17336672" "chr1:19719613-19720113" "chr1:20539344-20539844"
##  [61] "chr1:940282-940782"     "chr1:2986471-2986971"   "chr1:3244594-3245094"  
##  [64] "chr1:3309993-3310493"   "chr1:3458489-3458989"   "chr1:3691306-3691806"  
##  [67] "chr1:3797600-3798100"   "chr1:4388438-4388938"   "chr1:5569866-5570366"  
##  [70] "chr1:6489662-6490162"   "chr1:6732543-6733043"   "chr1:7296085-7296585"  
##  [73] "chr1:7609307-7609807"   "chr1:7632733-7633233"   "chr1:8576830-8577330"  
##  [76] "chr1:3526969-3527469"   "chr1:4698287-4698787"   "chr1:5831046-5831546"  
##  [79] "chr1:6230818-6231318"   "chr1:7173691-7174191"   "chr1:8998227-8998727"  
##  [82] "chr1:9256128-9256628"   "chr1:11753213-11753713" "chr1:16212724-16213224"
##  [85] "chr1:17517855-17518355" "chr1:22232666-22233166" "chr1:25198935-25199435"
##  [88] "chr1:27288283-27288783" "chr1:36674838-36675338" "chr1:39826343-39826843"
##  [91] "chr1:890838-891338"     "chr1:1242382-1242882"   "chr1:1310497-1310997"  
##  [94] "chr1:1695647-1696147"   "chr1:1711624-1712124"   "chr1:1821570-1822070"  
##  [97] "chr1:2074239-2074739"   "chr1:2080069-2080569"   "chr1:2082652-2083152"  
## [100] "chr1:2165768-2166268"   "chr1:2177462-2177962"   "chr1:2178055-2178555"  
## [103] "chr1:2186949-2187449"   "chr1:3594022-3594522"   "chr1:3817692-3818192"  
## [106] "chr1:976000-976500"     "chr1:1148113-1148613"   "chr1:1157229-1157729"  
## [109] "chr1:1243747-1244247"   "chr1:1564723-1565223"   "chr1:1565451-1565951"  
## [112] "chr1:1566572-1567072"   "chr1:1567413-1567913"   "chr1:1623715-1624215"  
## [115] "chr1:1840413-1840913"   "chr1:2161231-2161731"   "chr1:2171009-2171509"  
## [118] "chr1:2187486-2187986"   "chr1:2236638-2237138"   "chr1:2477559-2478059"  
## [121] "chr1:2225944-2226444"   "chr1:2886250-2886750"   "chr1:6084410-6084910"  
## [124] "chr1:7022468-7022968"   "chr1:8030210-8030710"   "chr1:8537239-8537739"  
## [127] "chr1:8889516-8890016"   "chr1:10313802-10314302" "chr1:13820457-13820957"
## [130] "chr1:16023323-16023823" "chr1:16416444-16416944" "chr1:17567375-17567875"
## [133] "chr1:19732624-19733124" "chr1:20404699-20405199" "chr1:21651559-21652059"
## [136] "chr1:1774207-1774707"   "chr1:1833387-1833887"   "chr1:6647377-6647877"  
## [139] "chr1:9473434-9473934"   "chr1:11751396-11751896" "chr1:23855064-23855564"
## [142] "chr1:24055079-24055579" "chr1:24517429-24517929" "chr1:26098310-26098810"
## [145] "chr1:26867767-26868267" "chr1:26869514-26870014" "chr1:27032626-27033126"
## [148] "chr1:27853533-27854033" "chr1:27928850-27929350" "chr1:31949777-31950277"
## [151] "chr1:845326-845826"     "chr1:856373-856873"     "chr1:999469-999969"    
## [154] "chr1:1079369-1079869"   "chr1:1369562-1370062"   "chr1:1831012-1831512"  
## [157] "chr1:1935026-1935526"   "chr1:1977839-1978339"   "chr1:2058349-2058849"  
## [160] "chr1:2210550-2211050"   "chr1:2221933-2222433"   "chr1:2705485-2705985"  
## [163] "chr1:2762283-2762783"   "chr1:2804998-2805498"   "chr1:2829153-2829653"
## Adding Annotations..
## Preparing Main Heatmap..
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
## ArchR logging successful to : ArchRLogs/ArchR-plotMarkerHeatmap-24602a4cc5f8-Date-2023-01-15_Time-21-38-33.log
draw(heatmapPeaks, heatmap_legend_side = "bot", annotation_legend_side = "bot")

pv <- markerPlot(seMarker = markersPeaks, name = "Erythroid", cutOff = "FDR <= 0.1 & Log2FC >= 1", plotAs = "Volcano")
## Warning: 'markerPlot' is deprecated.
## Use 'plotMarkers' instead.
## See help("Deprecated")
pv
## Warning: Removed 58 rows containing missing values (`geom_point_rast()`).

p <- plotBrowserTrack(
    ArchRProj = projHeme5, 
    groupBy = "Clusters2", 
    geneSymbol = c("GATA1"),
    features =  getMarkers(markersPeaks, cutOff = "FDR <= 0.1 & Log2FC >= 1", returnGR = TRUE)["Erythroid"],
    upstream = 50000,
    downstream = 50000
)
## ArchR logging to : ArchRLogs/ArchR-plotBrowserTrack-246037820724-Date-2023-01-15_Time-21-38-53.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:38:53 : Validating Region, 0.002 mins elapsed.
## GRanges object with 1 range and 2 metadata columns:
##       seqnames            ranges strand |     gene_id      symbol
##          <Rle>         <IRanges>  <Rle> | <character> <character>
##   [1]     chrX 48644982-48652717      + |        2623       GATA1
##   -------
##   seqinfo: 24 sequences from hg19 genome
## 2023-01-15 21:38:53 : Adding Bulk Tracks (1 of 1), 0.002 mins elapsed.
## 2023-01-15 21:38:54 : Adding Feature Tracks (1 of 1), 0.014 mins elapsed.
## 2023-01-15 21:38:54 : Adding Gene Tracks (1 of 1), 0.015 mins elapsed.
## 2023-01-15 21:38:54 : Plotting, 0.018 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-plotBrowserTrack-246037820724-Date-2023-01-15_Time-21-38-53.log
grid::grid.newpage()
grid::grid.draw(p$GATA1)

markerTest <- getMarkerFeatures(
  ArchRProj = projHeme5, 
  useMatrix = "PeakMatrix",
  groupBy = "Clusters2",
  testMethod = "wilcoxon",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  useGroups = "Erythroid",
  bgdGroups = "Progenitor"
)
## ArchR logging to : ArchRLogs/ArchR-getMarkerFeatures-24602eb169ba-Date-2023-01-15_Time-21-38-55.log
## If there is an issue, please report to github with logFile!
## MatrixClass = Sparse.Integer.Matrix
## 2023-01-15 21:38:55 : Matching Known Biases, 0.003 mins elapsed.
## 2023-01-15 21:38:56 : Computing Pairwise Tests (1 of 1), 0.018 mins elapsed.
## Pairwise Test Erythroid : Seqnames chr1
## Pairwise Test Erythroid : Seqnames chr10
## Pairwise Test Erythroid : Seqnames chr11
## Pairwise Test Erythroid : Seqnames chr12
## Pairwise Test Erythroid : Seqnames chr13
## Pairwise Test Erythroid : Seqnames chr14
## Pairwise Test Erythroid : Seqnames chr15
## Pairwise Test Erythroid : Seqnames chr16
## Pairwise Test Erythroid : Seqnames chr17
## Pairwise Test Erythroid : Seqnames chr18
## Pairwise Test Erythroid : Seqnames chr19
## Pairwise Test Erythroid : Seqnames chr2
## Pairwise Test Erythroid : Seqnames chr20
## Pairwise Test Erythroid : Seqnames chr21
## Pairwise Test Erythroid : Seqnames chr22
## Pairwise Test Erythroid : Seqnames chr3
## Pairwise Test Erythroid : Seqnames chr4
## Pairwise Test Erythroid : Seqnames chr5
## Pairwise Test Erythroid : Seqnames chr6
## Pairwise Test Erythroid : Seqnames chr7
## Pairwise Test Erythroid : Seqnames chr8
## Pairwise Test Erythroid : Seqnames chr9
## Pairwise Test Erythroid : Seqnames chrX
## ###########
## 2023-01-15 21:39:17 : Completed Pairwise Tests, 0.355 mins elapsed.
## ###########
## ArchR logging successful to : ArchRLogs/ArchR-getMarkerFeatures-24602eb169ba-Date-2023-01-15_Time-21-38-55.log
markerPlot(seMarker = markerTest, name = "Erythroid", cutOff = "FDR <= 0.1 & abs(Log2FC) >= 1", plotAs = "Volcano")
## Warning: 'markerPlot' is deprecated.
## Use 'plotMarkers' instead.
## See help("Deprecated")
## Warning: Removed 55 rows containing missing values (`geom_point_rast()`).

#saveArchRProject(ArchRProj = projHeme5, outputDirectory = "Save-ProjHeme5", load = FALSE)

Motif and Feature Enrichment

projHeme5 <- addMotifAnnotations(ArchRProj = projHeme5, motifSet = "cisbp", name = "Motif")
## ArchR logging to : ArchRLogs/ArchR-addMotifAnnotations-246073fc7fbb-Date-2023-01-15_Time-21-39-25.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:39:27 : Gettting Motif Set, Species : Homo sapiens, 0.002 mins elapsed.
## Using version 2 motifs!
## 2023-01-15 21:39:28 : Finding Motif Positions with motifmatchr!, 0.029 mins elapsed.
## 2023-01-15 21:41:42 : All Motifs Overlap at least 1 peak!, 2.26 mins elapsed.
## 2023-01-15 21:41:42 : Creating Motif Overlap Matrix, 2.26 mins elapsed.
## 2023-01-15 21:41:45 : Finished Getting Motif Info!, 2.302 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addMotifAnnotations-246073fc7fbb-Date-2023-01-15_Time-21-39-25.log
motifsUp <- peakAnnoEnrichment(
    seMarker = markerTest,
    ArchRProj = projHeme5,
    peakAnnotation = "Motif",
    cutOff = "FDR <= 0.1 & Log2FC >= 0.5"
  )
## ArchR logging to : ArchRLogs/ArchR-peakAnnoEnrichment-24606b384c90-Date-2023-01-15_Time-21-41-46.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:41:49 : Computing Enrichments 1 of 1, 0.057 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-peakAnnoEnrichment-24606b384c90-Date-2023-01-15_Time-21-41-46.log
motifsUp
## class: SummarizedExperiment 
## dim: 870 1 
## metadata(0):
## assays(10): mlog10Padj mlog10p ... CompareFrequency feature
## rownames(870): TFAP2B_1 TFAP2D_2 ... TBX18_869 TBX22_870
## rowData names(0):
## colnames(1): Erythroid
## colData names(0):
df <- data.frame(TF = rownames(motifsUp), mlog10Padj = assay(motifsUp)[,1])
df <- df[order(df$mlog10Padj, decreasing = TRUE),]
df$rank <- seq_len(nrow(df))
head(df)
##            TF mlog10Padj rank
## 383 GATA1_383   507.2283    1
## 388 GATA2_388   497.4459    2
## 384 GATA3_384   422.9416    3
## 385 GATA5_385   409.9321    4
## 386 GATA4_386   315.7656    5
## 387 GATA6_387   214.8645    6
ggplot(df, aes(rank, mlog10Padj, color = mlog10Padj)) + 
  geom_point(size = 1) +
  ggrepel::geom_label_repel(
        data = df[rev(seq_len(30)), ], aes(x = rank, y = mlog10Padj, label = TF), 
        size = 1.5,
        nudge_x = 2,
        color = "black"
  ) + theme_ArchR() + 
  ylab("-log10(P-adj) Motif Enrichment") + 
  xlab("Rank Sorted TFs Enriched") +
  scale_color_gradientn(colors = paletteContinuous(set = "comet"))
## Warning: ggrepel: 22 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

enrichMotifs <- peakAnnoEnrichment(
    seMarker = markersPeaks,
    ArchRProj = projHeme5,
    peakAnnotation = "Motif",
    cutOff = "FDR <= 0.1 & Log2FC >= 0.5"
  )
## ArchR logging to : ArchRLogs/ArchR-peakAnnoEnrichment-246058087f26-Date-2023-01-15_Time-21-41-50.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:41:54 : Computing Enrichments 1 of 11, 0.056 mins elapsed.
## 2023-01-15 21:41:54 : Computing Enrichments 2 of 11, 0.062 mins elapsed.
## 2023-01-15 21:41:54 : Computing Enrichments 3 of 11, 0.065 mins elapsed.
## 2023-01-15 21:41:55 : Computing Enrichments 4 of 11, 0.07 mins elapsed.
## 2023-01-15 21:41:55 : Computing Enrichments 5 of 11, 0.075 mins elapsed.
## 2023-01-15 21:41:55 : Computing Enrichments 6 of 11, 0.078 mins elapsed.
## 2023-01-15 21:41:55 : Computing Enrichments 7 of 11, 0.083 mins elapsed.
## 2023-01-15 21:41:56 : Computing Enrichments 8 of 11, 0.087 mins elapsed.
## 2023-01-15 21:41:56 : Computing Enrichments 9 of 11, 0.092 mins elapsed.
## 2023-01-15 21:41:56 : Computing Enrichments 10 of 11, 0.095 mins elapsed.
## 2023-01-15 21:41:56 : Computing Enrichments 11 of 11, 0.1 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-peakAnnoEnrichment-246058087f26-Date-2023-01-15_Time-21-41-50.log
heatmapEM <- plotEnrichHeatmap(enrichMotifs, n = 7, transpose = TRUE)
## ArchR logging to : ArchRLogs/ArchR-plotEnrichHeatmap-2460226b569-Date-2023-01-15_Time-21-41-57.log
## If there is an issue, please report to github with logFile!
## Adding Annotations..
## Preparing Main Heatmap..
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
ComplexHeatmap::draw(heatmapEM, heatmap_legend_side = "bot", annotation_legend_side = "bot")

## The overlap with bulk ATAC-seq peak set

projHeme5 <- addArchRAnnotations(ArchRProj = projHeme5, collection = "ATAC")
## ArchR logging to : ArchRLogs/ArchR-addArchRAnnotations-24606461704e-Date-2023-01-15_Time-21-41-58.log
## If there is an issue, please report to github with logFile!
## Annotating Chromosomes
## 2023-01-15 21:41:58 :
##  Annotating Chr: chr1
## 2023-01-15 21:41:58 :
##  Annotating Chr: chr2
## 2023-01-15 21:41:58 :
##  Annotating Chr: chr3
## 2023-01-15 21:41:58 :
##  Annotating Chr: chr4
## 2023-01-15 21:41:58 :
##  Annotating Chr: chr5
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr6
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr7
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr8
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr9
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr10
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr11
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr12
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr13
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr14
## 2023-01-15 21:41:59 :
##  Annotating Chr: chr15
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr16
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr17
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr18
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr19
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr20
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr21
## 2023-01-15 21:42:00 :
##  Annotating Chr: chr22
## 2023-01-15 21:42:00 :
##  Annotating Chr: chrX
## 2023-01-15 21:42:00 :
## 2023-01-15 21:42:01 : All Regions Overlap at least 1 peak!, 0.052 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addArchRAnnotations-24606461704e-Date-2023-01-15_Time-21-41-58.log
enrichATAC <- peakAnnoEnrichment(
    seMarker = markersPeaks,
    ArchRProj = projHeme5,
    peakAnnotation = "ATAC",
    cutOff = "FDR <= 0.1 & Log2FC >= 0.5"
  )
## ArchR logging to : ArchRLogs/ArchR-peakAnnoEnrichment-24603914445e-Date-2023-01-15_Time-21-42-01.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:42:05 : Computing Enrichments 1 of 11, 0.051 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 2 of 11, 0.054 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 3 of 11, 0.055 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 4 of 11, 0.057 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 5 of 11, 0.059 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 6 of 11, 0.061 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 7 of 11, 0.063 mins elapsed.
## 2023-01-15 21:42:05 : Computing Enrichments 8 of 11, 0.064 mins elapsed.
## 2023-01-15 21:42:06 : Computing Enrichments 9 of 11, 0.067 mins elapsed.
## 2023-01-15 21:42:06 : Computing Enrichments 10 of 11, 0.069 mins elapsed.
## 2023-01-15 21:42:06 : Computing Enrichments 11 of 11, 0.07 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-peakAnnoEnrichment-24603914445e-Date-2023-01-15_Time-21-42-01.log
heatmapATAC <- plotEnrichHeatmap(enrichATAC, n = 7, transpose = TRUE)
## ArchR logging to : ArchRLogs/ArchR-plotEnrichHeatmap-246061a7b934-Date-2023-01-15_Time-21-42-06.log
## If there is an issue, please report to github with logFile!
## Adding Annotations..
## Preparing Main Heatmap..
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
ComplexHeatmap::draw(heatmapATAC, heatmap_legend_side = "bot", annotation_legend_side = "bot")

ChromVAR Deviatons Enrichment

chromVAR predicts enrichment of TF activity on a per-cell basis. It also corrects the Tn5 transposase insertion sequence bias.

if("Motif" %ni% names(projHeme5@peakAnnotation)){
    projHeme5 <- addMotifAnnotations(ArchRProj = projHeme5, motifSet = "cisbp", name = "Motif")
}
projHeme5 <- addBgdPeaks(projHeme5)
## Identifying Background Peaks!
# This function save the result into the "MotifMatrix" matrix which can be used later.
projHeme5 <- addDeviationsMatrix(
  ArchRProj = projHeme5, 
  peakAnnotation = "Motif",
  force = TRUE
)
## Using Previous Background Peaks!
## ArchR logging to : ArchRLogs/ArchR-addDeviationsMatrix-2460738b4101-Date-2023-01-15_Time-21-42-14.log
## If there is an issue, please report to github with logFile!
## NULL
## as(<lgCMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "dMatrix") instead
## 2023-01-15 21:42:17 : Batch Execution w/ safelapply!, 0 mins elapsed.
## ###########
## 2023-01-15 21:49:41 : Completed Computing Deviations!, 7.454 mins elapsed.
## ###########
## ArchR logging successful to : ArchRLogs/ArchR-addDeviationsMatrix-2460738b4101-Date-2023-01-15_Time-21-42-14.log

The variation of each TF motif across all cells.

plotVarDev <- getVarDeviations(projHeme5, name = "MotifMatrix", plot = TRUE)
## DataFrame with 6 rows and 6 columns
##      seqnames     idx        name combinedVars combinedMeans      rank
##         <Rle> <array>     <array>    <numeric>     <numeric> <integer>
## f388        z     388   GATA2_388      11.8166    -0.0346497         1
## f155        z     155   CEBPA_155      11.5241    -0.1882559         2
## f336        z     336    SPIB_336      11.5169    -0.0824527         3
## f383        z     383   GATA1_383      11.5130    -0.0435856         4
## f651        z     651 SMARCC1_651      10.4661    -0.1335942         5
## f385        z     385   GATA5_385      10.3523    -0.0341025         6
plotVarDev
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Check the variation of target TF motifs across all cells. ### Get the motif names corresponding to each TF

motifs <- c("GATA1", "CEBPA", "EBF1", "IRF4", "TBX21", "PAX5")
markerMotifs <- getFeatures(projHeme5, select = paste(motifs, collapse="|"), useMatrix = "MotifMatrix")
markerMotifs
##  [1] "z:TBX21_780"          "z:PAX5_709"           "z:IRF4_632"          
##  [4] "z:GATA1_383"          "z:CEBPA_155"          "z:EBF1_67"           
##  [7] "z:SREBF1_22"          "deviations:TBX21_780" "deviations:PAX5_709" 
## [10] "deviations:IRF4_632"  "deviations:GATA1_383" "deviations:CEBPA_155"
## [13] "deviations:EBF1_67"   "deviations:SREBF1_22"

Motif names clean up

markerMotifs <- grep("z:", markerMotifs, value = TRUE)
markerMotifs <- markerMotifs[markerMotifs %ni% "z:SREBF1_22"]
markerMotifs
## [1] "z:TBX21_780" "z:PAX5_709"  "z:IRF4_632"  "z:GATA1_383" "z:CEBPA_155"
## [6] "z:EBF1_67"
projHeme5 <- addImputeWeights(projHeme5)
## ArchR logging to : ArchRLogs/ArchR-addImputeWeights-246015784905-Date-2023-01-15_Time-21-49-44.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:49:44 : Computing Impute Weights Using Magic (Cell 2018), 0 mins elapsed.
p <- plotGroups(ArchRProj = projHeme5, 
  groupBy = "Clusters2", 
  colorBy = "MotifMatrix", 
  name = markerMotifs,
  imputeWeights = getImputeWeights(projHeme5)
)
## Getting ImputeWeights
## Getting Matrix Values...
## 2023-01-15 21:49:52 :
## 
## ArchR logging to : ArchRLogs/ArchR-imputeMatrix-246053c8d7d2-Date-2023-01-15_Time-21-49-54.log
## If there is an issue, please report to github with logFile!
## Using weights on disk
## Using weights on disk
## 1 2 3 4 5 6
p2 <- lapply(seq_along(p), function(x){
  if(x != 1){
    p[[x]] + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6) +
    theme(plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "cm")) +
    theme(
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank(),
        axis.title.y=element_blank()
    ) + ylab("")
  }else{
    p[[x]] + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6) +
    theme(plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "cm")) +
    theme(
        axis.ticks.y=element_blank(),
        axis.title.y=element_blank()
    ) + ylab("")
  }
})
do.call(cowplot::plot_grid, c(list(nrow = 1, rel_widths = c(2, rep(1, length(p2) - 1))),p2))
## Picking joint bandwidth of 0.0196
## Picking joint bandwidth of 0.0243
## Picking joint bandwidth of 0.068
## Picking joint bandwidth of 0.114
## Picking joint bandwidth of 0.0911
## Picking joint bandwidth of 0.0702

### Plot the motif enrichment z-scores on UMAP

p <- plotEmbedding(
    ArchRProj = projHeme5, 
    colorBy = "MotifMatrix", 
    name = sort(markerMotifs), 
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme5)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24603cace528-Date-2023-01-15_Time-21-49-58.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = MotifMatrix
## Getting Matrix Values...
## 2023-01-15 21:49:58 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1
## Warning in file(file, if (append) "a" else "w"): cannot open file
## 'ArchRLogs/ArchR-plotEmbedding-24603cace528-Date-2023-01-15_Time-21-49-58.log':
## Interrupted system call
## 2 3 4 5 6 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24603cace528-Date-2023-01-15_Time-21-49-58.log
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

### Plot the previously infered gene expression scores on UMAP and compare with motif enrichment z-scores

markerRNA <- getFeatures(projHeme5, select = paste(motifs, collapse="|"), useMatrix = "GeneScoreMatrix")
markerRNA <- markerRNA[markerRNA %ni% c("SREBF1","CEBPA-DT")]
markerRNA
## [1] "TBX21" "CEBPA" "EBF1"  "IRF4"  "PAX5"  "GATA1"
p <- plotEmbedding(
    ArchRProj = projHeme5, 
    colorBy = "GeneScoreMatrix", 
    name = sort(markerRNA), 
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme5)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24604037a2a6-Date-2023-01-15_Time-21-50-08.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneScoreMatrix
## Getting Matrix Values...
## 2023-01-15 21:50:08 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24604037a2a6-Date-2023-01-15_Time-21-50-08.log
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

We previously linked our scATAC-seq data with corresponding scRNA-seq data, also plot the linked gene expression values

markerRNA <- getFeatures(projHeme5, select = paste(motifs, collapse="|"), useMatrix = "GeneIntegrationMatrix")
markerRNA <- markerRNA[markerRNA %ni% c("SREBF1","CEBPA-DT")]
markerRNA
## [1] "TBX21" "CEBPA" "EBF1"  "IRF4"  "PAX5"  "GATA1"
p <- plotEmbedding(
    ArchRProj = projHeme5, 
    colorBy = "GeneIntegrationMatrix", 
    name = sort(markerRNA), 
    embedding = "UMAP",
    continuousSet = "blueYellow",
    imputeWeights = getImputeWeights(projHeme5)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-24605d9281b5-Date-2023-01-15_Time-21-50-17.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = GeneIntegrationMatrix
## Getting Matrix Values...
## 2023-01-15 21:50:18 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-24605d9281b5-Date-2023-01-15_Time-21-50-17.log
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

## ChromVAR custom deviation ### Calculate enrichment using bulk ATAC-seq peaks

if("ATAC" %ni% names(projHeme5@peakAnnotation)){
    projHeme5 <- addArchRAnnotations(ArchRProj = projHeme5, collection = "ATAC")
}
projHeme5 <- addDeviationsMatrix(
  ArchRProj = projHeme5, 
  peakAnnotation = "ATAC",
  force = TRUE
)
## Using Previous Background Peaks!
## ArchR logging to : ArchRLogs/ArchR-addDeviationsMatrix-24601a018088-Date-2023-01-15_Time-21-50-28.log
## If there is an issue, please report to github with logFile!
## NULL
## 2023-01-15 21:50:30 : Batch Execution w/ safelapply!, 0 mins elapsed.
## ###########
## 2023-01-15 21:53:21 : Completed Computing Deviations!, 2.887 mins elapsed.
## ###########
## ArchR logging successful to : ArchRLogs/ArchR-addDeviationsMatrix-24601a018088-Date-2023-01-15_Time-21-50-28.log
plotVarDev <- getVarDeviations(projHeme5, plot = TRUE, name = "ATACMatrix")
## DataFrame with 6 rows and 6 columns
##     seqnames     idx                  name combinedVars combinedMeans      rank
##        <Rle> <array>               <array>    <numeric>     <numeric> <integer>
## f22        z      22 IAtlas_T_CD8posCenMem      12.8024    -0.0918708         1
## f86        z      86              Heme_CD8      12.5015    -0.0686054         2
## f85        z      85              Heme_CD4      12.3654    -0.0506556         3
## f23        z      23 IAtlas_T_CD8posEffMem      12.2010    -0.0997087         4
## f21        z      21       IAtlas_T_CD8pos      12.1765    -0.0840077         5
## f33        z      33 IAtlas_T_Th1Precursor      12.1520    -0.0830697         6
plotVarDev
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ATACPeaks <- c("Heme_HSC", "Heme_LMPP", "Heme_Ery", "Heme_Mono", "Heme_CD4", "Heme_CD8", "Heme_B", "Heme_NK", "IAtlas_DC_Plasmacytoid")
markerATAC <- getFeatures(projHeme5, select = paste(ATACPeaks, collapse="|"), useMatrix = "ATACMatrix")
markerATAC <- sort(grep("z:", markerATAC, value = TRUE))
markerATAC
## [1] "z:Heme_B"                 "z:Heme_CD4"              
## [3] "z:Heme_CD8"               "z:Heme_Ery"              
## [5] "z:Heme_HSC"               "z:Heme_LMPP"             
## [7] "z:Heme_Mono"              "z:Heme_NK"               
## [9] "z:IAtlas_DC_Plasmacytoid"
p <- plotEmbedding(
    ArchRProj = projHeme5, 
    colorBy = "ATACMatrix", 
    name = markerATAC, 
    embedding = "UMAP",
    imputeWeights = getImputeWeights(projHeme5)
)
## Getting ImputeWeights
## ArchR logging to : ArchRLogs/ArchR-plotEmbedding-2460125aed26-Date-2023-01-15_Time-21-53-24.log
## If there is an issue, please report to github with logFile!
## Getting UMAP Embedding
## ColorBy = ATACMatrix
## Getting Matrix Values...
## 2023-01-15 21:53:24 :
## 
## Imputing Matrix
## Using weights on disk
## Using weights on disk
## Plotting Embedding
## 1 2 3 4 5 6 7 8 9 
## ArchR logging successful to : ArchRLogs/ArchR-plotEmbedding-2460125aed26-Date-2023-01-15_Time-21-53-24.log
p2 <- lapply(p, function(x){
    x + guides(color = FALSE, fill = FALSE) + 
    theme_ArchR(baseSize = 6.5) +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) +
    theme(
        axis.text.x=element_blank(), 
        axis.ticks.x=element_blank(), 
        axis.text.y=element_blank(), 
        axis.ticks.y=element_blank()
    )
})
do.call(cowplot::plot_grid, c(list(ncol = 3),p2))

# Motif footprinting

#Get the "Motif" (CIS-BP motifs) from peakAnnotation slot
motifPositions <- getPositions(projHeme5,name="Motif")
motifPositions
## GRangesList object of length 870:
## $TFAP2B_1
## GRanges object with 16724 ranges and 1 metadata column:
##           seqnames              ranges strand |     score
##              <Rle>           <IRanges>  <Rle> | <numeric>
##       [1]     chr1       873916-873927      + |   8.30742
##       [2]     chr1       873916-873927      - |   8.30742
##       [3]     chr1       875505-875516      + |  10.20625
##       [4]     chr1       875505-875516      - |   9.04384
##       [5]     chr1       896671-896682      + |   9.94830
##       ...      ...                 ...    ... .       ...
##   [16720]     chrX 154032970-154032981      + |   8.57909
##   [16721]     chrX 154299568-154299579      + |   8.88679
##   [16722]     chrX 154664929-154664940      - |   8.15284
##   [16723]     chrX 154807684-154807695      + |   9.56155
##   [16724]     chrX 154807684-154807695      - |  10.59781
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
## 
## ...
## <869 more elements>
motifs <- c("GATA1", "CEBPA", "EBF1", "IRF4", "TBX21", "PAX5")
markerMotifs <- unlist(lapply(motifs, function(x) grep(x, names(motifPositions), value = TRUE)))
markerMotifs <- markerMotifs[markerMotifs %ni% "SREBF1_22"]
markerMotifs
## [1] "GATA1_383" "CEBPA_155" "EBF1_67"   "IRF4_632"  "TBX21_780" "PAX5_709"
seFoot <- getFootprints(
  ArchRProj = projHeme5, 
  positions = motifPositions[markerMotifs], 
  groupBy = "Clusters2"
)
## ArchR logging to : ArchRLogs/ArchR-getFootprints-24607fe48132-Date-2023-01-15_Time-21-53-34.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:53:34 : Computing Kmer Bias Table, 0.003 mins elapsed.
## 2023-01-15 21:53:42 : Finished Computing Kmer Tables, 0.123 mins elapsed.
## 2023-01-15 21:53:42 : Computing Footprints, 0.126 mins elapsed.
## 2023-01-15 21:53:57 : Computing Footprints Bias, 0.383 mins elapsed.
## 2023-01-15 21:54:10 : Summarizing Footprints, 0.6 mins elapsed.

Normalization of Footprints for Tn5 Bias

Without Tn5 Bias normalization, Subtracting the Tn5 Bias or Dividing by the Tn5 Bias. Results in /HemeOutput/Plots

plotFootprints(
  seFoot = seFoot,
  ArchRProj = projHeme5, 
  normMethod = "None",
  plotName = "Footprints-No-Normalization",
  addDOC = FALSE,
  smoothWindow = 5
)
## ArchR logging to : ArchRLogs/ArchR-plotFootprints-24605c019124-Date-2023-01-15_Time-21-54-12.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:54:12 : Plotting Footprint : GATA1_383 (1 of 6), 0.001 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## 2023-01-15 21:54:13 : Plotting Footprint : CEBPA_155 (2 of 6), 0.023 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## 2023-01-15 21:54:14 : Plotting Footprint : EBF1_67 (3 of 6), 0.045 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## 2023-01-15 21:54:16 : Plotting Footprint : IRF4_632 (4 of 6), 0.068 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## 2023-01-15 21:54:17 : Plotting Footprint : TBX21_780 (5 of 6), 0.089 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## 2023-01-15 21:54:18 : Plotting Footprint : PAX5_709 (6 of 6), 0.112 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = None
## ArchR logging successful to : ArchRLogs/ArchR-plotFootprints-24605c019124-Date-2023-01-15_Time-21-54-12.log
plotFootprints(
  seFoot = seFoot,
  ArchRProj = projHeme5, 
  normMethod = "Subtract",
  plotName = "Footprints-Subtract-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)
## ArchR logging to : ArchRLogs/ArchR-plotFootprints-2460351f539-Date-2023-01-15_Time-21-54-20.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:54:20 : Plotting Footprint : GATA1_383 (1 of 6), 0.001 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## 2023-01-15 21:54:21 : Plotting Footprint : CEBPA_155 (2 of 6), 0.023 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## 2023-01-15 21:54:22 : Plotting Footprint : EBF1_67 (3 of 6), 0.045 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## 2023-01-15 21:54:24 : Plotting Footprint : IRF4_632 (4 of 6), 0.07 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## 2023-01-15 21:54:25 : Plotting Footprint : TBX21_780 (5 of 6), 0.093 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## 2023-01-15 21:54:27 : Plotting Footprint : PAX5_709 (6 of 6), 0.116 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Subtract
## ArchR logging successful to : ArchRLogs/ArchR-plotFootprints-2460351f539-Date-2023-01-15_Time-21-54-20.log
plotFootprints(
  seFoot = seFoot,
  ArchRProj = projHeme5, 
  normMethod = "Divide",
  plotName = "Footprints-Divide-Bias",
  addDOC = FALSE,
  smoothWindow = 5
)
## ArchR logging to : ArchRLogs/ArchR-plotFootprints-2460312e5133-Date-2023-01-15_Time-21-54-28.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:54:28 : Plotting Footprint : GATA1_383 (1 of 6), 0.002 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## 2023-01-15 21:54:29 : Plotting Footprint : CEBPA_155 (2 of 6), 0.024 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## 2023-01-15 21:54:31 : Plotting Footprint : EBF1_67 (3 of 6), 0.047 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## 2023-01-15 21:54:32 : Plotting Footprint : IRF4_632 (4 of 6), 0.069 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## 2023-01-15 21:54:34 : Plotting Footprint : TBX21_780 (5 of 6), 0.093 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## 2023-01-15 21:54:35 : Plotting Footprint : PAX5_709 (6 of 6), 0.115 mins elapsed.
## Applying smoothing window to footprint
## Normalizing by flanking regions
## NormMethod = Divide
## ArchR logging successful to : ArchRLogs/ArchR-plotFootprints-2460312e5133-Date-2023-01-15_Time-21-54-28.log

Footprinting with user-defined feature set

seTSS <- getFootprints(
  ArchRProj = projHeme5, 
  positions = GRangesList(TSS = getTSS(projHeme5)), 
  groupBy = "Clusters2",
  flank = 2000
)
## ArchR logging to : ArchRLogs/ArchR-getFootprints-246052d17adc-Date-2023-01-15_Time-21-54-36.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:54:36 : Computing Kmer Bias Table, 0.002 mins elapsed.
## 2023-01-15 21:54:54 : Finished Computing Kmer Tables, 0.286 mins elapsed.
## 2023-01-15 21:54:54 : Computing Footprints, 0.288 mins elapsed.
## 2023-01-15 21:55:10 : Computing Footprints Bias, 0.561 mins elapsed.
## 2023-01-15 21:55:25 : Summarizing Footprints, 0.808 mins elapsed.
plotFootprints(
  seFoot = seTSS,
  ArchRProj = projHeme5, 
  normMethod = "None",
  plotName = "TSS-No-Normalization",
  addDOC = FALSE,
  flank = 2000,
  flankNorm = 100
)
## ArchR logging to : ArchRLogs/ArchR-plotFootprints-246036d92bfe-Date-2023-01-15_Time-21-55-26.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:55:26 : Plotting Footprint : TSS (1 of 1), 0.001 mins elapsed.
## Normalizing by flanking regions
## NormMethod = None
## ArchR logging successful to : ArchRLogs/ArchR-plotFootprints-246036d92bfe-Date-2023-01-15_Time-21-55-26.log

Co-accessibility

projHeme5 <- addCoAccessibility(
    ArchRProj = projHeme5,
    reducedDims = "IterativeLSI"
)
## ArchR logging to : ArchRLogs/ArchR-addCoAccessibility-246043a887da-Date-2023-01-15_Time-21-55-35.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:55:35 : Computing KNN, 0.002 mins elapsed.
## 2023-01-15 21:55:35 : Identifying Non-Overlapping KNN pairs, 0.004 mins elapsed.
## 2023-01-15 21:55:38 : Identified 494 Groupings!, 0.065 mins elapsed.
## 2023-01-15 21:55:40 : Computing Co-Accessibility chr1 (1 of 23), 0.09 mins elapsed.
## 2023-01-15 21:55:45 : Computing Co-Accessibility chr2 (2 of 23), 0.171 mins elapsed.
## 2023-01-15 21:55:48 : Computing Co-Accessibility chr3 (3 of 23), 0.232 mins elapsed.
## 2023-01-15 21:55:52 : Computing Co-Accessibility chr4 (4 of 23), 0.286 mins elapsed.
## 2023-01-15 21:55:55 : Computing Co-Accessibility chr5 (5 of 23), 0.334 mins elapsed.
## 2023-01-15 21:55:58 : Computing Co-Accessibility chr6 (6 of 23), 0.384 mins elapsed.
## 2023-01-15 21:56:01 : Computing Co-Accessibility chr7 (7 of 23), 0.438 mins elapsed.
## 2023-01-15 21:56:04 : Computing Co-Accessibility chr8 (8 of 23), 0.489 mins elapsed.
## 2023-01-15 21:56:07 : Computing Co-Accessibility chr9 (9 of 23), 0.537 mins elapsed.
## 2023-01-15 21:56:10 : Computing Co-Accessibility chr10 (10 of 23), 0.586 mins elapsed.
## 2023-01-15 21:56:13 : Computing Co-Accessibility chr11 (11 of 23), 0.636 mins elapsed.
## 2023-01-15 21:56:16 : Computing Co-Accessibility chr12 (12 of 23), 0.69 mins elapsed.
## 2023-01-15 21:56:19 : Computing Co-Accessibility chr13 (13 of 23), 0.743 mins elapsed.
## 2023-01-15 21:56:22 : Computing Co-Accessibility chr14 (14 of 23), 0.785 mins elapsed.
## 2023-01-15 21:56:24 : Computing Co-Accessibility chr15 (15 of 23), 0.831 mins elapsed.
## 2023-01-15 21:56:27 : Computing Co-Accessibility chr16 (16 of 23), 0.877 mins elapsed.
## 2023-01-15 21:56:30 : Computing Co-Accessibility chr17 (17 of 23), 0.926 mins elapsed.
## 2023-01-15 21:56:33 : Computing Co-Accessibility chr18 (18 of 23), 0.981 mins elapsed.
## 2023-01-15 21:56:36 : Computing Co-Accessibility chr19 (19 of 23), 1.022 mins elapsed.
## 2023-01-15 21:56:39 : Computing Co-Accessibility chr20 (20 of 23), 1.078 mins elapsed.
## 2023-01-15 21:56:42 : Computing Co-Accessibility chr21 (21 of 23), 1.123 mins elapsed.
## 2023-01-15 21:56:44 : Computing Co-Accessibility chr22 (22 of 23), 1.162 mins elapsed.
## 2023-01-15 21:56:47 : Computing Co-Accessibility chrX (23 of 23), 1.205 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addCoAccessibility-246043a887da-Date-2023-01-15_Time-21-55-35.log
cA <- getCoAccessibility(
    ArchRProj = projHeme5,
    corCutOff = 0.5,
    resolution = 1,
    returnLoops = FALSE
)
cA
## DataFrame with 95944 rows and 11 columns
##       queryHits subjectHits seqnames correlation Variability1 Variability2
##       <integer>   <integer>    <Rle>   <numeric>    <numeric>    <numeric>
## 1             5           7     chr1    0.729628   0.00559909   0.02913767
## 2             5          33     chr1    0.504182   0.00559909   0.01104183
## 3             6           7     chr1    0.511142   0.00409733   0.02913767
## 4             7           5     chr1    0.729628   0.02913767   0.00559909
## 5             7           6     chr1    0.511142   0.02913767   0.00409733
## ...         ...         ...      ...         ...          ...          ...
## 95940    143154      143146     chrX    0.532194   0.00390673   0.02920153
## 95941    143159      143177     chrX    0.513681   0.00870711   0.01454613
## 95942    143177      143159     chrX    0.513681   0.01454613   0.00870711
## 95943    143198      143199     chrX    0.532867   0.00947622   0.00250734
## 95944    143199      143198     chrX    0.532867   0.00250734   0.00947622
##           TStat        Pval         FDR VarQuantile1 VarQuantile2
##       <numeric>   <numeric>   <numeric>    <numeric>    <numeric>
## 1       23.6661 3.09624e-83 1.27170e-80     0.570949     0.918611
## 2       12.9497 3.27901e-33 1.18820e-31     0.570949     0.744449
## 3       13.1911 3.11045e-34 1.19563e-32     0.480444     0.918611
## 4       23.6661 3.09624e-83 1.27170e-80     0.918611     0.570949
## 5       13.1911 3.11045e-34 1.19563e-32     0.918611     0.480444
## ...         ...         ...         ...          ...          ...
## 95940   13.9432 1.78166e-37 8.28743e-36     0.465737     0.918982
## 95941   13.2800 1.29936e-34 5.11388e-33     0.689446     0.799879
## 95942   13.2800 1.29936e-34 5.11388e-33     0.799879     0.689446
## 95943   13.9678 1.39133e-37 6.50832e-36     0.711361     0.329707
## 95944   13.9678 1.39133e-37 6.50832e-36     0.329707     0.711361
metadata(cA)[[1]]
## GRanges object with 143229 ranges and 0 metadata columns:
##             seqnames              ranges strand
##                <Rle>           <IRanges>  <Rle>
##        Mono     chr1       752472-752972      *
##           B     chr1       762688-763188      *
##           B     chr1       801007-801507      *
##           B     chr1       805039-805539      *
##         CLP     chr1       845326-845826      *
##         ...      ...                 ...    ...
##       CD4.N     chrX 154841576-154842076      *
##          NK     chrX 154842394-154842894      *
##   Erythroid     chrX 154912373-154912873      *
##        PreB     chrX 154996185-154996685      *
##         GMP     chrX 154996992-154997492      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
metadata(cA)[[1]][5]
## GRanges object with 1 range and 0 metadata columns:
##       seqnames        ranges strand
##          <Rle>     <IRanges>  <Rle>
##   CLP     chr1 845326-845826      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
metadata(cA)[[1]][8]
## GRanges object with 1 range and 0 metadata columns:
##         seqnames        ranges strand
##            <Rle>     <IRanges>  <Rle>
##   CD4.M     chr1 858370-858870      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths

Plot co-accessibility browser tracks

markerGenes  <- c(
    "CD34", #Early Progenitor
    "GATA1", #Erythroid
    "PAX5", "MS4A1", #B-Cell Trajectory
    "CD14", #Monocytes
    "CD3D", "CD8A", "TBX21", "IL7R" #TCells
  )

p <- plotBrowserTrack(
    ArchRProj = projHeme5, 
    groupBy = "Clusters2", 
    geneSymbol = markerGenes, 
    upstream = 50000,
    downstream = 50000,
    loops = getCoAccessibility(projHeme5)
)
## ArchR logging to : ArchRLogs/ArchR-plotBrowserTrack-246057fe6264-Date-2023-01-15_Time-21-56-54.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:56:54 : Validating Region, 0.002 mins elapsed.
## GRanges object with 9 ranges and 2 metadata columns:
##       seqnames              ranges strand |     gene_id      symbol
##          <Rle>           <IRanges>  <Rle> | <character> <character>
##   [1]     chr1 208059883-208084683      - |         947        CD34
##   [2]     chrX   48644982-48652717      + |        2623       GATA1
##   [3]     chr9   36838531-37034476      - |        5079        PAX5
##   [4]    chr11   60223282-60238225      + |         931       MS4A1
##   [5]     chr5 140011313-140013286      - |         929        CD14
##   [6]    chr11 118209789-118213459      - |         915        CD3D
##   [7]     chr2   87011728-87035519      - |         925        CD8A
##   [8]    chr17   45810610-45823485      + |       30009       TBX21
##   [9]     chr5   35856977-35879705      + |        3575        IL7R
##   -------
##   seqinfo: 24 sequences from hg19 genome
## 2023-01-15 21:56:54 : Adding Bulk Tracks (1 of 9), 0.002 mins elapsed.
## 2023-01-15 21:56:55 : Adding Feature Tracks (1 of 9), 0.016 mins elapsed.
## 2023-01-15 21:56:55 : Adding Loop Tracks (1 of 9), 0.017 mins elapsed.
## 2023-01-15 21:56:55 : Adding Gene Tracks (1 of 9), 0.019 mins elapsed.
## 2023-01-15 21:56:55 : Plotting, 0.021 mins elapsed.
## 2023-01-15 21:56:56 : Adding Bulk Tracks (2 of 9), 0.036 mins elapsed.
## 2023-01-15 21:56:57 : Adding Feature Tracks (2 of 9), 0.048 mins elapsed.
## 2023-01-15 21:56:57 : Adding Loop Tracks (2 of 9), 0.049 mins elapsed.
## 2023-01-15 21:56:57 : Adding Gene Tracks (2 of 9), 0.052 mins elapsed.
## 2023-01-15 21:56:57 : Plotting, 0.055 mins elapsed.
## 2023-01-15 21:56:58 : Adding Bulk Tracks (3 of 9), 0.071 mins elapsed.
## 2023-01-15 21:56:59 : Adding Feature Tracks (3 of 9), 0.083 mins elapsed.
## 2023-01-15 21:56:59 : Adding Loop Tracks (3 of 9), 0.084 mins elapsed.
## 2023-01-15 21:56:59 : Adding Gene Tracks (3 of 9), 0.091 mins elapsed.
## 2023-01-15 21:56:59 : Plotting, 0.094 mins elapsed.
## 2023-01-15 21:57:00 : Adding Bulk Tracks (4 of 9), 0.11 mins elapsed.
## 2023-01-15 21:57:01 : Adding Feature Tracks (4 of 9), 0.121 mins elapsed.
## 2023-01-15 21:57:01 : Adding Loop Tracks (4 of 9), 0.122 mins elapsed.
## 2023-01-15 21:57:01 : Adding Gene Tracks (4 of 9), 0.127 mins elapsed.
## 2023-01-15 21:57:02 : Plotting, 0.13 mins elapsed.
## 2023-01-15 21:57:02 : Adding Bulk Tracks (5 of 9), 0.142 mins elapsed.
## 2023-01-15 21:57:03 : Adding Feature Tracks (5 of 9), 0.155 mins elapsed.
## 2023-01-15 21:57:03 : Adding Loop Tracks (5 of 9), 0.156 mins elapsed.
## 2023-01-15 21:57:03 : Adding Gene Tracks (5 of 9), 0.161 mins elapsed.
## 2023-01-15 21:57:04 : Plotting, 0.164 mins elapsed.
## 2023-01-15 21:57:05 : Adding Bulk Tracks (6 of 9), 0.18 mins elapsed.
## 2023-01-15 21:57:05 : Adding Feature Tracks (6 of 9), 0.193 mins elapsed.
## 2023-01-15 21:57:05 : Adding Loop Tracks (6 of 9), 0.194 mins elapsed.
## 2023-01-15 21:57:06 : Adding Gene Tracks (6 of 9), 0.2 mins elapsed.
## 2023-01-15 21:57:06 : Plotting, 0.204 mins elapsed.
## 2023-01-15 21:57:07 : Adding Bulk Tracks (7 of 9), 0.218 mins elapsed.
## 2023-01-15 21:57:08 : Adding Feature Tracks (7 of 9), 0.231 mins elapsed.
## 2023-01-15 21:57:08 : Adding Loop Tracks (7 of 9), 0.232 mins elapsed.
## 2023-01-15 21:57:08 : Adding Gene Tracks (7 of 9), 0.243 mins elapsed.
## 2023-01-15 21:57:09 : Plotting, 0.246 mins elapsed.
## 2023-01-15 21:57:09 : Adding Bulk Tracks (8 of 9), 0.26 mins elapsed.
## 2023-01-15 21:57:10 : Adding Feature Tracks (8 of 9), 0.273 mins elapsed.
## 2023-01-15 21:57:10 : Adding Loop Tracks (8 of 9), 0.273 mins elapsed.
## 2023-01-15 21:57:11 : Adding Gene Tracks (8 of 9), 0.282 mins elapsed.
## 2023-01-15 21:57:11 : Plotting, 0.286 mins elapsed.
## 2023-01-15 21:57:12 : Adding Bulk Tracks (9 of 9), 0.301 mins elapsed.
## 2023-01-15 21:57:13 : Adding Feature Tracks (9 of 9), 0.313 mins elapsed.
## 2023-01-15 21:57:13 : Adding Loop Tracks (9 of 9), 0.314 mins elapsed.
## 2023-01-15 21:57:13 : Adding Gene Tracks (9 of 9), 0.321 mins elapsed.
## 2023-01-15 21:57:13 : Plotting, 0.324 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-plotBrowserTrack-246057fe6264-Date-2023-01-15_Time-21-56-54.log
grid::grid.newpage()
grid::grid.draw(p$CD14)

Peak2GeneLinkage

Correlate peak accessibility to scRNA-seq gene expression

projHeme5 <- addPeak2GeneLinks(
    ArchRProj = projHeme5,
    reducedDims = "IterativeLSI"
)
## ArchR logging to : ArchRLogs/ArchR-addPeak2GeneLinks-246018b5a513-Date-2023-01-15_Time-21-57-15.log
## If there is an issue, please report to github with logFile!
## 2023-01-15 21:57:15 : Getting Available Matrices, 0.002 mins elapsed.
## 2023-01-15 21:57:15 : Filtered Low Prediction Score Cells (1250 of 10250, 0.122), 0.003 mins elapsed.
## 2023-01-15 21:57:15 : Computing KNN, 0.007 mins elapsed.
## 2023-01-15 21:57:15 : Identifying Non-Overlapping KNN pairs, 0.009 mins elapsed.
## 2023-01-15 21:57:19 : Identified 494 Groupings!, 0.073 mins elapsed.
## 2023-01-15 21:57:19 : Getting Group RNA Matrix, 0.074 mins elapsed.
## 2023-01-15 21:58:01 : Getting Group ATAC Matrix, 0.77 mins elapsed.
## 2023-01-15 21:58:42 : Normalizing Group Matrices, 1.45 mins elapsed.
## 2023-01-15 21:58:46 : Finding Peak Gene Pairings, 1.516 mins elapsed.
## 2023-01-15 21:58:46 : Computing Correlations, 1.522 mins elapsed.
## 2023-01-15 21:58:52 : Completed Peak2Gene Correlations!, 1.619 mins elapsed.
## ArchR logging successful to : ArchRLogs/ArchR-addPeak2GeneLinks-246018b5a513-Date-2023-01-15_Time-21-57-15.log
p2g <- getPeak2GeneLinks(
    ArchRProj = projHeme5,
    corCutOff = 0.45,
    resolution = 1,
    returnLoops = FALSE
)
p2g
## DataFrame with 39146 rows and 6 columns
##         idxATAC    idxRNA Correlation         FDR  VarQATAC   VarQRNA
##       <integer> <integer>   <numeric>   <numeric> <numeric> <numeric>
## 1            46         4    0.518798 5.37027e-34  0.708809  0.266115
## 2            44         5    0.617452 1.72141e-51  0.949026  0.763615
## 3            51         5    0.451046 5.69674e-25  0.912092  0.763615
## 4            65         5    0.481414 9.17375e-29  0.667826  0.763615
## 5            79         5    0.566011 1.26131e-41  0.582927  0.763615
## ...         ...       ...         ...         ...       ...       ...
## 39142    143179     18590    0.508216 1.89481e-32  0.494083  0.962798
## 39143    143180     18590    0.574334 4.20739e-43  0.686327  0.962798
## 39144    143198     18590    0.510057 1.02892e-32  0.740555  0.962798
## 39145    143198     18591    0.481032 1.02954e-28  0.740555  0.953981
## 39146    143217     18595    0.591408 2.85330e-46  0.716524  0.447718
metadata(p2g)[[1]]
## GRanges object with 143229 ranges and 0 metadata columns:
##            seqnames              ranges strand
##               <Rle>           <IRanges>  <Rle>
##        [1]     chr1       752472-752972      *
##        [2]     chr1       762688-763188      *
##        [3]     chr1       801007-801507      *
##        [4]     chr1       805039-805539      *
##        [5]     chr1       845326-845826      *
##        ...      ...                 ...    ...
##   [143225]     chrX 154841576-154842076      *
##   [143226]     chrX 154842394-154842894      *
##   [143227]     chrX 154912373-154912873      *
##   [143228]     chrX 154996185-154996685      *
##   [143229]     chrX 154996992-154997492      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
metadata(p2g)
## $peakSet
## GRanges object with 143229 ranges and 0 metadata columns:
##            seqnames              ranges strand
##               <Rle>           <IRanges>  <Rle>
##        [1]     chr1       752472-752972      *
##        [2]     chr1       762688-763188      *
##        [3]     chr1       801007-801507      *
##        [4]     chr1       805039-805539      *
##        [5]     chr1       845326-845826      *
##        ...      ...                 ...    ...
##   [143225]     chrX 154841576-154842076      *
##   [143226]     chrX 154842394-154842894      *
##   [143227]     chrX 154912373-154912873      *
##   [143228]     chrX 154996185-154996685      *
##   [143229]     chrX 154996992-154997492      *
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
## 
## $geneSet
## GRanges object with 18601 ranges and 2 metadata columns:
##           seqnames    ranges strand |      name     idx
##              <Rle> <IRanges>  <Rle> |   <array> <array>
##       [1]     chr1     69091      * |     OR4F5       1
##       [2]     chr1    762902      * | LINC00115       2
##       [3]     chr1    812182      * |    FAM41C       3
##       [4]     chr1    860530      * |    SAMD11       4
##       [5]     chr1    894679      * |     NOC2L       5
##       ...      ...       ...    ... .       ...     ...
##   [18597]     chrX 154299695      * |     BRCC3     708
##   [18598]     chrX 154444701      * |      VBP1     709
##   [18599]     chrX 154493852      * |    RAB39B     710
##   [18600]     chrX 154563986      * |     CLIC2     711
##   [18601]     chrX 154842622      * |     TMLHE     712
##   -------
##   seqinfo: 23 sequences from an unspecified genome; no seqlengths
## 
## $seATAC
## [1] "/Users/nzhang/Desktop/ArchR/HemeOutput/Peak2GeneLinks/seATAC-Group-KNN.rds"
## 
## $seRNA
## [1] "/Users/nzhang/Desktop/ArchR/HemeOutput/Peak2GeneLinks/seRNA-Group-KNN.rds"

Identify TF regulators by combining chromVAR with scRNA-seq

Identify the TF whose gene expression is closely correlated with its motif enrichment (MotifMatrix) and its motif enrichment is significantly different across cell clusters.

MotifMatrix of chromVAR deviations average by clusters

seGroupMotif <- getGroupSE(ArchRProj = projHeme5, useMatrix = "MotifMatrix", groupBy = "Clusters2")
## ArchR logging to : ArchRLogs/ArchR-getGroupSE-24601e215cbd-Date-2023-01-15_Time-22-00-40.log
## If there is an issue, please report to github with logFile!
## Getting Group Matrix
## 2023-01-15 22:00:44 : Successfully Created Group Matrix, 0.076 mins elapsed.
## Normalizing by number of Cells
## ArchR logging successful to : ArchRLogs/ArchR-getGroupSE-24601e215cbd-Date-2023-01-15_Time-22-00-40.log
seGroupMotif
## class: SummarizedExperiment 
## dim: 1740 11 
## metadata(0):
## assays(1): MotifMatrix
## rownames(1740): f1 f2 ... f1739 f1740
## rowData names(3): seqnames idx name
## colnames(11): B CD4.M ... PreB Progenitor
## colData names(20): TSSEnrichment ReadsInTSS ... FRIP nCells

seGroupMotif has two seqnames: 1. “deviations”: raw deviation. 2. “z”: deviation z-score.

Only use the z scores here.

seZ <- seGroupMotif[rowData(seGroupMotif)$seqnames=="z",]
seZ
## class: SummarizedExperiment 
## dim: 870 11 
## metadata(0):
## assays(1): MotifMatrix
## rownames(870): f871 f872 ... f1739 f1740
## rowData names(3): seqnames idx name
## colnames(11): B CD4.M ... PreB Progenitor
## colData names(20): TSSEnrichment ReadsInTSS ... FRIP nCells
head(assay(seZ))
##                B      CD4.M      CD4.N          CLP  Erythroid        GMP
## f871  0.01461606 -0.9943487 -0.5025861  0.957349313  0.1787210  0.5184027
## f872  0.30592774  0.4197083  0.1772401 -0.524613963 -0.3222590 -0.4127469
## f873 -0.38627583 -1.8891104 -1.2269646  1.228653666  0.3684630  0.8732100
## f874 -0.10525071 -0.3571070 -0.2790384  0.308071137  0.1053562  0.1739950
## f875 -0.03003612 -0.2632663 -0.1113669 -0.007475132  0.1081030  0.1332870
## f876  0.68745234  1.7747323  0.5618254  0.250581521 -0.1969163 -0.6797505
##              Mono         NK         pDC         PreB  Progenitor
## f871 -0.003033827 -0.6622329  0.38316801  0.736031634  0.47999646
## f872  0.270210411  0.4140534 -0.39944839 -0.009954809 -0.57034139
## f873  0.501844552 -1.4345311  0.49963001  0.694572857  0.79096865
## f874  0.052037375 -0.2472815  0.04957727  0.213642380  0.14895770
## f875  0.023047875 -0.2035694  0.17832704  0.144655879  0.09716411
## f876 -0.681517316  1.1494191 -0.32855911  0.480618906 -0.19929050
head(rowData(seZ))
## DataFrame with 6 rows and 3 columns
##      seqnames     idx     name
##         <Rle> <array>  <array>
## f871        z       1 TFAP2B_1
## f872        z       2 TFAP2D_2
## f873        z       3 TFAP2C_3
## f874        z       4 TFAP2E_4
## f875        z       5 TFAP2A_5
## f876        z       6 ARID3A_6
head(colData(seZ))
## DataFrame with 6 rows and 20 columns
##           TSSEnrichment ReadsInTSS ReadsInPromoter ReadsInBlacklist
##               <numeric>  <numeric>       <numeric>        <numeric>
## B               18.2345      361.5          1312.0               47
## CD4.M           21.4990     1020.0          3492.0               82
## CD4.N           18.2970      349.0          1249.0               44
## CLP             15.6290      362.0          1294.0               48
## Erythroid       15.9770      487.0          1710.0               57
## GMP             13.9005      469.5          1712.5               64
##           PromoterRatio    PassQC NucleosomeRatio nMultiFrags nMonoFrags
##               <numeric> <numeric>       <numeric>   <numeric>  <numeric>
## B              0.303396         1        1.234426         348      954.5
## CD4.M          0.429245         1        1.016743         633     2001.0
## CD4.N          0.326771         1        1.438531         314      785.5
## CLP            0.254557         1        0.517127         167     1547.0
## Erythroid      0.260774         1        0.500755         199     2077.0
## GMP            0.231532         1        0.567286         221     2106.0
##              nFrags  nDiFrags DoubletScore DoubletEnrichment BlacklistRatio
##           <numeric> <numeric>    <numeric>         <numeric>      <numeric>
## B            2179.0     852.5      0.00000               0.9     0.01046523
## CD4.M        4058.0    1377.0      0.15881               1.6     0.00992282
## CD4.N        1912.0     803.0      0.00000               0.6     0.01103220
## CLP          2633.0     797.0      0.00000               0.6     0.00923742
## Erythroid    3258.0     969.0      0.00000               1.4     0.00865860
## GMP          3774.5    1166.5      2.38853               1.8     0.00867212
##           predictedScore_Un predictedScore_Co predictedScore ReadsInPeaks
##                   <numeric>         <numeric>      <numeric>    <numeric>
## B                  1.000000          1.000000       1.000000       2286.0
## CD4.M              0.392271          0.604815       0.604815       4888.0
## CD4.N              0.403471          0.432651       0.432651       1890.0
## CLP                0.753240          0.771535       0.771535       2532.0
## Erythroid          0.583012          0.630343       0.630343       3483.0
## GMP                0.732071          0.483545       0.483545       3846.5
##                FRIP    nCells
##           <numeric> <integer>
## B          0.530171       420
## CD4.M      0.609881       815
## CD4.N      0.500623      1242
## CLP        0.520265       385
## Erythroid  0.536694       899
## GMP        0.554268      1140

Identify the maximum delta in z-score between all clusters

rowData(seZ)$maxDelta <- lapply(seq_len(ncol(seZ)), function(x){
  rowMaxs(assay(seZ) - assay(seZ)[,x])
}) %>% Reduce("cbind", .) %>% rowMaxs
head(rowData(seZ))
## DataFrame with 6 rows and 4 columns
##      seqnames     idx     name  maxDelta
##         <Rle> <array>  <array> <numeric>
## f871        z       1 TFAP2B_1  1.951698
## f872        z       2 TFAP2D_2  0.990050
## f873        z       3 TFAP2C_3  3.117764
## f874        z       4 TFAP2E_4  0.665178
## f875        z       5 TFAP2A_5  0.441593
## f876        z       6 ARID3A_6  2.456250

Calculate the correlation between MotifMatrix and Gene score/ integrated expression

corGSM_MM <- correlateMatrices(
    ArchRProj = projHeme5,
    useMatrix1 = "GeneScoreMatrix",
    useMatrix2 = "MotifMatrix",
    reducedDims = "IterativeLSI"
)
## ArchR logging to : ArchRLogs/ArchR-correlateMatrices-246018e1fcdf-Date-2023-01-15_Time-22-00-46.log
## If there is an issue, please report to github with logFile!
## When accessing features from a matrix of class Sparse.Assays.Matrix it requires 1 seqname!
## Continuing with first seqname 'z'!
## If confused, try getFeatures(ArchRProj, 'MotifMatrix') to list out available seqnames for input!
## 2023-01-15 22:00:48 : Testing 825 Mappings!, 0.032 mins elapsed.
## 2023-01-15 22:00:48 : Computing KNN, 0.032 mins elapsed.
## 2023-01-15 22:00:48 : Identifying Non-Overlapping KNN pairs, 0.034 mins elapsed.
## 2023-01-15 22:00:52 : Identified 494 Groupings!, 0.093 mins elapsed.
## 2023-01-15 22:00:52 : Getting Group Matrix 1, 0.104 mins elapsed.
## 2023-01-15 22:01:09 : Getting Group Matrix 2, 0.371 mins elapsed.
## Some entries in groupMat2 are less than 0, continuing without Log2 Normalization.
## Most likely this assay is a deviations matrix.
## Getting Correlations...
## 2023-01-15 22:01:15 :
## Computing Correlation (250 of 825)
## Computing Correlation (500 of 825)
## Computing Correlation (750 of 825)
## ArchR logging successful to : ArchRLogs/ArchR-correlateMatrices-246018e1fcdf-Date-2023-01-15_Time-22-00-46.log
head(corGSM_MM)
## DataFrame with 6 rows and 14 columns
##   GeneScoreMatrix_name MotifMatrix_name       cor        padj        pval
##            <character>      <character> <numeric>   <numeric>   <numeric>
## 1                 HES4          HES4_95  0.120757 1.00000e+00 7.20970e-03
## 2                 HES5          HES5_98  0.393830 7.28409e-17 8.90476e-20
## 3               PRDM16       PRDM16_211  0.348553 1.20739e-12 1.47603e-15
## 4                 TP73         TP73_705  0.571206 3.28424e-41 4.01497e-44
## 5             TP73-AS1         TP73_705 -0.164659 1.94122e-01 2.37313e-04
## 6                 HES2          HES2_19 -0.355364 3.08343e-13 3.76948e-16
##   GeneScoreMatrix_seqnames GeneScoreMatrix_start GeneScoreMatrix_end
##                <character>             <integer>           <integer>
## 1                     chr1                935552              934342
## 2                     chr1               2461684             2460184
## 3                     chr1               2985742             3355185
## 4                     chr1               3569129             3652765
## 5                     chr1               3663937             3652548
## 6                     chr1               6484730             6472498
##   GeneScoreMatrix_strand GeneScoreMatrix_idx GeneScoreMatrix_matchName
##                <integer>           <integer>               <character>
## 1                      2                  15                      HES4
## 2                      2                  74                      HES5
## 3                      1                  82                    PRDM16
## 4                      1                  89                      TP73
## 5                      2                  90                      TP73
## 6                      2                 111                      HES2
##   MotifMatrix_seqnames MotifMatrix_idx MotifMatrix_matchName
##            <character>       <integer>           <character>
## 1                    z              95                  HES4
## 2                    z              98                  HES5
## 3                    z             211                PRDM16
## 4                    z             705                  TP73
## 5                    z             705                  TP73
## 6                    z              19                  HES2
corGIM_MM <- correlateMatrices(
    ArchRProj = projHeme5,
    useMatrix1 = "GeneIntegrationMatrix",
    useMatrix2 = "MotifMatrix",
    reducedDims = "IterativeLSI"
)
## ArchR logging to : ArchRLogs/ArchR-correlateMatrices-246063ab5f5f-Date-2023-01-15_Time-22-01-15.log
## If there is an issue, please report to github with logFile!
## When accessing features from a matrix of class Sparse.Assays.Matrix it requires 1 seqname!
## Continuing with first seqname 'z'!
## If confused, try getFeatures(ArchRProj, 'MotifMatrix') to list out available seqnames for input!
## 2023-01-15 22:01:17 : Testing 798 Mappings!, 0.03 mins elapsed.
## 2023-01-15 22:01:17 : Computing KNN, 0.03 mins elapsed.
## 2023-01-15 22:01:17 : Identifying Non-Overlapping KNN pairs, 0.032 mins elapsed.
## 2023-01-15 22:01:21 : Identified 494 Groupings!, 0.096 mins elapsed.
## 2023-01-15 22:01:22 : Getting Group Matrix 1, 0.106 mins elapsed.
## 2023-01-15 22:01:39 : Getting Group Matrix 2, 0.392 mins elapsed.
## Some entries in groupMat2 are less than 0, continuing without Log2 Normalization.
## Most likely this assay is a deviations matrix.
## Getting Correlations...
## 2023-01-15 22:01:45 :
## Computing Correlation (250 of 798)
## Computing Correlation (500 of 798)
## Computing Correlation (750 of 798)
## ArchR logging successful to : ArchRLogs/ArchR-correlateMatrices-246063ab5f5f-Date-2023-01-15_Time-22-01-15.log
head(corGIM_MM)
## DataFrame with 6 rows and 14 columns
##   GeneIntegrationMatrix_name MotifMatrix_name       cor        padj        pval
##                  <character>      <character> <numeric>   <numeric>   <numeric>
## 1                       HES4          HES4_95 -0.158247 2.20176e-01 4.14644e-04
## 2                       HES5          HES5_98 -0.251397 7.82291e-06 1.47324e-08
## 3                     PRDM16       PRDM16_211  0.184262 2.00843e-02 3.78236e-05
## 4                       TP73         TP73_705  0.114584 1.00000e+00 1.08123e-02
## 5                       HES2          HES2_19 -0.381744 7.35545e-16 1.38521e-18
## 6                       ENO1         ENO1_803  0.282464 8.64839e-08 1.62870e-10
##   GeneIntegrationMatrix_seqnames GeneIntegrationMatrix_start
##                      <character>                   <integer>
## 1                           chr1                      935552
## 2                           chr1                     2461684
## 3                           chr1                     2985742
## 4                           chr1                     3569129
## 5                           chr1                     6484730
## 6                           chr1                     8939151
##   GeneIntegrationMatrix_end GeneIntegrationMatrix_strand
##                   <integer>                    <integer>
## 1                    934342                            2
## 2                   2460184                            2
## 3                   3355185                            1
## 4                   3652765                            1
## 5                   6472498                            2
## 6                   8921059                            2
##   GeneIntegrationMatrix_idx GeneIntegrationMatrix_matchName
##                   <integer>                     <character>
## 1                         8                            HES4
## 2                        53                            HES5
## 3                        59                          PRDM16
## 4                        64                            TP73
## 5                        81                            HES2
## 6                       101                            ENO1
##   MotifMatrix_seqnames MotifMatrix_idx MotifMatrix_matchName
##            <character>       <integer>           <character>
## 1                    z              95                  HES4
## 2                    z              98                  HES5
## 3                    z             211                PRDM16
## 4                    z             705                  TP73
## 5                    z              19                  HES2
## 6                    z             803                  ENO1

Add MotifMatrix max delta to the correlation data frame

corGSM_MM$maxDelta <- rowData(seZ)[match(corGSM_MM$MotifMatrix_name, rowData(seZ)$name), "maxDelta"]
corGIM_MM$maxDelta <- rowData(seZ)[match(corGIM_MM$MotifMatrix_name, rowData(seZ)$name), "maxDelta"]

Identify positive TF regulators

corGSM_MM <- corGSM_MM[order(abs(corGSM_MM$cor), decreasing = TRUE), ]
corGSM_MM <- corGSM_MM[which(!duplicated(gsub("\\-.*","",corGSM_MM[,"MotifMatrix_name"]))), ]
corGSM_MM$TFRegulator <- "NO"
corGSM_MM$TFRegulator[which(corGSM_MM$cor > 0.5 & corGSM_MM$padj < 0.01 & corGSM_MM$maxDelta > quantile(corGSM_MM$maxDelta, 0.75))] <- "YES"
sort(corGSM_MM[corGSM_MM$TFRegulator=="YES",1])
##  [1] "ARNTL"    "ASCL1"    "ATF1"     "ATOH1"    "ATOH8"    "BCL11A"  
##  [7] "CEBPA-DT" "CEBPB"    "CEBPD"    "CREB1"    "CREB3L4"  "EBF1"    
## [13] "EGR2"     "EOMES"    "ERF"      "ETS1"     "FUBP1"    "GABPA"   
## [19] "GATA1"    "GATA2"    "GATA5"    "IRF1"     "JDP2"     "KLF11"   
## [25] "KLF13"    "KLF2"     "LEF1"     "LYL1"     "MITF"     "NEUROD1" 
## [31] "NFE2"     "NFIA"     "NFIC"     "NFIX"     "NHLH1"    "PRDM1"   
## [37] "RUNX2"    "SIX5"     "SMAD1"    "SP4"      "SPI1"     "SPIB"    
## [43] "TAL1"     "TCF15"    "TFAP2C"   "TWIST1"   "YY1"
ggplot(data.frame(corGSM_MM), aes(cor, maxDelta, color = TFRegulator)) +
  geom_point() + 
  theme_ArchR() +
  geom_vline(xintercept = 0, lty = "dashed") + 
  scale_color_manual(values = c("NO"="darkgrey", "YES"="firebrick3")) +
  xlab("Correlation To Gene Score") +
  ylab("Max TF Motif Delta") +
  scale_y_continuous(
    expand = c(0,0), 
    limits = c(0, max(corGSM_MM$maxDelta)*1.05)
  )
## Warning: Removed 7 rows containing missing values (`geom_point()`).

corGIM_MM <- corGIM_MM[order(abs(corGIM_MM$cor), decreasing = TRUE), ]
corGIM_MM <- corGIM_MM[which(!duplicated(gsub("\\-.*","",corGIM_MM[,"MotifMatrix_name"]))), ]
corGIM_MM$TFRegulator <- "NO"
corGIM_MM$TFRegulator[which(corGIM_MM$cor > 0.5 & corGIM_MM$padj < 0.01 & corGIM_MM$maxDelta > quantile(corGIM_MM$maxDelta, 0.75))] <- "YES"
sort(corGIM_MM[corGIM_MM$TFRegulator=="YES",1])
##  [1] "ARNTL" "BACH1" "CEBPA" "CEBPB" "CEBPD" "CEBPE" "CEBPG" "CTCF"  "EBF1" 
## [10] "ELK1"  "EOMES" "ETS1"  "FOS"   "FOSL1" "FOSL2" "GATA1" "GATA2" "IRF1" 
## [19] "IRF2"  "IRF9"  "JDP2"  "KLF13" "KLF2"  "LEF1"  "MITF"  "NFE2"  "NFIA" 
## [28] "NFIB"  "NFIX"  "NFKB2" "NR4A1" "NRF1"  "RUNX1" "RUNX2" "SMAD1" "SPI1" 
## [37] "STAT2" "TAL1"  "TCF12" "TCF3"  "TCF4"  "TEF"
ggplot(data.frame(corGIM_MM), aes(cor, maxDelta, color = TFRegulator)) +
  geom_point() + 
  theme_ArchR() +
  geom_vline(xintercept = 0, lty = "dashed") + 
  scale_color_manual(values = c("NO"="darkgrey", "YES"="firebrick3")) +
  xlab("Correlation To Gene Expression") +
  ylab("Max TF Motif Delta") +
  scale_y_continuous(
    expand = c(0,0), 
    limits = c(0, max(corGIM_MM$maxDelta)*1.05)
  )
## Warning: Removed 251 rows containing missing values (`geom_point()`).